From 359e965c7f3ff5a3b9c7a835cfc1bb612bb491d3 Mon Sep 17 00:00:00 2001 From: code4craft Date: Tue, 23 Apr 2013 20:58:03 +0800 Subject: [PATCH] init --- .gitignore | 2 + pom.xml | 105 + src/main/java/us/codecraft/spider/Page.java | 93 + .../java/us/codecraft/spider/Request.java | 31 + src/main/java/us/codecraft/spider/Site.java | 126 + src/main/java/us/codecraft/spider/Spider.java | 92 + .../spider/downloader/Downloader.java | 15 + .../downloader/HttpClientDownloader.java | 49 + .../spider/downloader/HttpClientPool.java | 64 + .../spider/pipeline/ConsolePipeline.java | 23 + .../spider/pipeline/FilePipeline.java | 55 + .../codecraft/spider/pipeline/Pipeline.java | 14 + .../spider/processor/PageProcessor.java | 24 + .../spider/processor/SimplePageProcessor.java | 41 + .../schedular/FileCacheQueueSchedular.java | 142 + .../spider/schedular/QueueSchedular.java | 40 + .../codecraft/spider/schedular/Schedular.java | 17 + .../us/codecraft/spider/selector/Html.java | 75 + .../codecraft/spider/selector/PlainText.java | 103 + .../spider/selector/RegexResult.java | 29 + .../spider/selector/RegexSelector.java | 82 + .../spider/selector/ReplaceSelector.java | 47 + .../codecraft/spider/selector/Selectable.java | 88 + .../codecraft/spider/selector/Selector.java | 16 + .../spider/selector/SelectorFactory.java | 82 + .../spider/selector/SmartContentSelector.java | 101 + .../spider/selector/XpathSelector.java | 69 + .../us/codecraft/spider/utils/UrlUtils.java | 94 + src/main/resources/log4j.xml | 26 + .../java/us/codecraft/spider/HtmlTest.java | 20 + .../java/us/codecraft/spider/SpiderTest.java | 132 + .../spider/samples/DiandianBlogProcessor.java | 29 + .../spider/samples/DianpingBlogProcessor.java | 33 + .../spider/samples/F58PageProcesser.java | 28 + .../spider/samples/HuxiuProcessor.java | 29 + .../spider/samples/NjuBBSProcessor.java | 28 + .../samples/OschinaBlogPageProcesser.java | 30 + .../spider/samples/OschinaPageProcesser.java | 29 + 
.../spider/samples/SinaBlogProcesser.java | 29 + .../spider/samples/TianyaPageProcesser.java | 28 + .../spider/selector/HtmlCleanerTest.java | 29 + .../spider/selector/RegexSelectorTest.java | 23 + .../selector/SmartConentSelectorTest.java | 3055 +++++++++++++++++ .../spider/selector/XpathSelectorTest.java | 2750 +++++++++++++++ .../codecraft/spider/utils/UrlUtilsTest.java | 647 ++++ src/test/resources/log4j.xml | 31 + 46 files changed, 8695 insertions(+) create mode 100644 .gitignore create mode 100644 pom.xml create mode 100644 src/main/java/us/codecraft/spider/Page.java create mode 100644 src/main/java/us/codecraft/spider/Request.java create mode 100644 src/main/java/us/codecraft/spider/Site.java create mode 100644 src/main/java/us/codecraft/spider/Spider.java create mode 100644 src/main/java/us/codecraft/spider/downloader/Downloader.java create mode 100644 src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java create mode 100644 src/main/java/us/codecraft/spider/downloader/HttpClientPool.java create mode 100644 src/main/java/us/codecraft/spider/pipeline/ConsolePipeline.java create mode 100644 src/main/java/us/codecraft/spider/pipeline/FilePipeline.java create mode 100644 src/main/java/us/codecraft/spider/pipeline/Pipeline.java create mode 100644 src/main/java/us/codecraft/spider/processor/PageProcessor.java create mode 100644 src/main/java/us/codecraft/spider/processor/SimplePageProcessor.java create mode 100644 src/main/java/us/codecraft/spider/schedular/FileCacheQueueSchedular.java create mode 100644 src/main/java/us/codecraft/spider/schedular/QueueSchedular.java create mode 100644 src/main/java/us/codecraft/spider/schedular/Schedular.java create mode 100644 src/main/java/us/codecraft/spider/selector/Html.java create mode 100644 src/main/java/us/codecraft/spider/selector/PlainText.java create mode 100644 src/main/java/us/codecraft/spider/selector/RegexResult.java create mode 100644 src/main/java/us/codecraft/spider/selector/RegexSelector.java 
create mode 100644 src/main/java/us/codecraft/spider/selector/ReplaceSelector.java create mode 100644 src/main/java/us/codecraft/spider/selector/Selectable.java create mode 100644 src/main/java/us/codecraft/spider/selector/Selector.java create mode 100644 src/main/java/us/codecraft/spider/selector/SelectorFactory.java create mode 100644 src/main/java/us/codecraft/spider/selector/SmartContentSelector.java create mode 100644 src/main/java/us/codecraft/spider/selector/XpathSelector.java create mode 100644 src/main/java/us/codecraft/spider/utils/UrlUtils.java create mode 100644 src/main/resources/log4j.xml create mode 100644 src/test/java/us/codecraft/spider/HtmlTest.java create mode 100644 src/test/java/us/codecraft/spider/SpiderTest.java create mode 100644 src/test/java/us/codecraft/spider/samples/DiandianBlogProcessor.java create mode 100644 src/test/java/us/codecraft/spider/samples/DianpingBlogProcessor.java create mode 100644 src/test/java/us/codecraft/spider/samples/F58PageProcesser.java create mode 100644 src/test/java/us/codecraft/spider/samples/HuxiuProcessor.java create mode 100644 src/test/java/us/codecraft/spider/samples/NjuBBSProcessor.java create mode 100644 src/test/java/us/codecraft/spider/samples/OschinaBlogPageProcesser.java create mode 100644 src/test/java/us/codecraft/spider/samples/OschinaPageProcesser.java create mode 100644 src/test/java/us/codecraft/spider/samples/SinaBlogProcesser.java create mode 100644 src/test/java/us/codecraft/spider/samples/TianyaPageProcesser.java create mode 100644 src/test/java/us/codecraft/spider/selector/HtmlCleanerTest.java create mode 100644 src/test/java/us/codecraft/spider/selector/RegexSelectorTest.java create mode 100644 src/test/java/us/codecraft/spider/selector/SmartConentSelectorTest.java create mode 100644 src/test/java/us/codecraft/spider/selector/XpathSelectorTest.java create mode 100644 src/test/java/us/codecraft/spider/utils/UrlUtilsTest.java create mode 100644 src/test/resources/log4j.xml diff --git 
a/.gitignore b/.gitignore new file mode 100644 index 00000000..0af075f7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target/* +*.iml diff --git a/pom.xml b/pom.xml new file mode 100644 index 00000000..c424910a --- /dev/null +++ b/pom.xml @@ -0,0 +1,105 @@ + + + us.codecraft + 0.0.1-SNAPSHOT + 4.0.0 + + webmagic + + + + org.apache.httpcomponents + httpclient + 4.2.1 + + + + junit + junit + 4.7 + test + + + + com.google.guava + guava + 13.0.1 + + + + org.apache.commons + commons-lang3 + 3.1 + + + + log4j + log4j + 1.2.17 + + + + commons-collections + commons-collections + 3.2.1 + + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.4 + + + + org.apache.commons + commons-io + 1.3.2 + + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.0-beta-7 + + + + + + \ No newline at end of file diff --git a/src/main/java/us/codecraft/spider/Page.java b/src/main/java/us/codecraft/spider/Page.java new file mode 100644 index 00000000..1f96e58c --- /dev/null +++ b/src/main/java/us/codecraft/spider/Page.java @@ -0,0 +1,93 @@ +package us.codecraft.spider; + +import org.apache.commons.lang3.StringUtils; +import us.codecraft.spider.selector.Selectable; +import us.codecraft.spider.utils.UrlUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午11:22 + */ +public class Page { + + private Request request; + + private Map fields = new ConcurrentHashMap(); + + private Selectable html; + + private Selectable url; + + private List targetRequests = new ArrayList(); + + public void process() { + fields.put("title", html.x("").r("")); + } + + public Page() { + } + + public Map getFields() { + return fields; + } 
+ + public void putField(String key, Selectable field) { + fields.put(key, field); + } + + public Selectable getHtml() { + return html; + } + + public void setHtml(Selectable html) { + this.html = html; + } + + public List getTargetRequests() { + return targetRequests; + } + + public void addTargetRequests(List requests) { + synchronized (targetRequests) { + for (String s : requests) { + if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { + break; + } + s = UrlUtils.fixRelativeUrl(s, url.toString()); + targetRequests.add(new Request(s)); + } + } + } + + public void addTargetRequests(String requestString) { + if (StringUtils.isBlank(requestString) || requestString.equals("#")) { + return; + } + synchronized (targetRequests) { + requestString = UrlUtils.fixRelativeUrl(requestString, url.toString()); + targetRequests.add(new Request(requestString)); + } + } + + public Selectable getUrl() { + return url; + } + + public void setUrl(Selectable url) { + this.url = url; + } + + public Request getRequest() { + return request; + } + + public void setRequest(Request request) { + this.request = request; + } +} diff --git a/src/main/java/us/codecraft/spider/Request.java b/src/main/java/us/codecraft/spider/Request.java new file mode 100644 index 00000000..4446c167 --- /dev/null +++ b/src/main/java/us/codecraft/spider/Request.java @@ -0,0 +1,31 @@ +package us.codecraft.spider; + +import us.codecraft.spider.Site; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午11:37 + */ +public class Request { + + private String url; + + private Object[] extra; + + public Request(String url, Object... 
extra) { + this.url = url; + this.extra = extra; + } + + public Object[] getExtra() { + return extra; + } + + public String getUrl() { + return url; + } + +} diff --git a/src/main/java/us/codecraft/spider/Site.java b/src/main/java/us/codecraft/spider/Site.java new file mode 100644 index 00000000..6f27a228 --- /dev/null +++ b/src/main/java/us/codecraft/spider/Site.java @@ -0,0 +1,126 @@ +package us.codecraft.spider; + +import java.util.HashSet; +import java.util.Set; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午12:13 + */ +public class Site { + + private String domain; + + private String userAgent; + + private String cookie; + + private String encoding; + + private String startUrl; + + private int sleepTime = 3000; + + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); + + private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; + + static { + DEFAULT_STATUS_CODE_SET.add(200); + } + + public static Site me() { + return new Site(); + } + + public Site setCookie(String cookie) { + this.cookie = cookie; + return this; + } + + public Site setUserAgent(String userAgent) { + this.userAgent = userAgent; + return this; + } + + public String getCookie() { + return cookie; + } + + public String getUserAgent() { + return userAgent; + } + + public String getDomain() { + return domain; + } + + public Site setDomain(String domain) { + this.domain = domain; + return this; + } + + public String getEncoding() { + return encoding; + } + + public Site setEncoding(String encoding) { + this.encoding = encoding; + return this; + } + + public Set getAcceptStatCode() { + return acceptStatCode; + } + + public Site setAcceptStatCode(Set acceptStatCode) { + this.acceptStatCode = acceptStatCode; + return this; + } + + public String getStartUrl() { + return startUrl; + } + + public Site setStartUrl(String startUrl) { + this.startUrl = startUrl; + return this; + } + + public int getSleepTime() { + return sleepTime; + } + + public Site setSleepTime(int sleepTime) { + 
this.sleepTime = sleepTime; + return this; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Site site = (Site) o; + + if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) + return false; + if (cookie != null ? !cookie.equals(site.cookie) : site.cookie != null) return false; + if (!domain.equals(site.domain)) return false; + if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false; + if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = domain.hashCode(); + result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); + result = 31 * result + (cookie != null ? cookie.hashCode() : 0); + result = 31 * result + (encoding != null ? encoding.hashCode() : 0); + result = 31 * result + (acceptStatCode != null ? 
acceptStatCode.hashCode() : 0); + return result; + } +} diff --git a/src/main/java/us/codecraft/spider/Spider.java b/src/main/java/us/codecraft/spider/Spider.java new file mode 100644 index 00000000..81ecd9de --- /dev/null +++ b/src/main/java/us/codecraft/spider/Spider.java @@ -0,0 +1,92 @@ +package us.codecraft.spider; + +import org.apache.commons.collections.CollectionUtils; +import org.apache.log4j.Logger; +import us.codecraft.spider.downloader.Downloader; +import us.codecraft.spider.downloader.HttpClientDownloader; +import us.codecraft.spider.pipeline.ConsolePipeline; +import us.codecraft.spider.pipeline.Pipeline; +import us.codecraft.spider.processor.PageProcessor; +import us.codecraft.spider.schedular.QueueSchedular; +import us.codecraft.spider.schedular.Schedular; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午6:53 + */ +public class Spider implements Runnable { + + private Downloader downloader = new HttpClientDownloader(); + + private Pipeline pipeline = new ConsolePipeline(); + + private PageProcessor pageProcessor; + + private Schedular schedular = new QueueSchedular(); + + private Logger logger = Logger.getLogger(getClass()); + + public static Spider me() { + return new Spider(); + } + + public Spider processor(PageProcessor pageProcessor) { + this.pageProcessor = pageProcessor; + schedular.push(new Request(pageProcessor.getSite().getStartUrl()), pageProcessor.getSite()); + return this; + } + + public Thread thread() { + return new Thread(this); + } + + public Spider schedular(Schedular schedular) { + this.schedular = schedular; + return this; + } + + public Spider pipeline(Pipeline pipeline) { + this.pipeline = pipeline; + return this; + } + + + @Override + public void run() { + Site site = pageProcessor.getSite(); + Request request = schedular.poll(site); + while (request != null) { + Page page = downloader.download(request,site); + if (page == null) { + 
sleep(site.getSleepTime()); + continue; + } + pageProcessor.process(page); + addRequest(page); + pipeline.process(page,site); + sleep(site.getSleepTime()); + request = schedular.poll(site); + } + } + + private void sleep(int time) { + try { + Thread.sleep(time); + } catch (InterruptedException e) { + e.printStackTrace(); + ; + } + } + + private void addRequest(Page page) { + if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { + for (Request request : page.getTargetRequests()) { + schedular.push(request,pageProcessor.getSite()); + } + } + } +} diff --git a/src/main/java/us/codecraft/spider/downloader/Downloader.java b/src/main/java/us/codecraft/spider/downloader/Downloader.java new file mode 100644 index 00000000..d20bfbb6 --- /dev/null +++ b/src/main/java/us/codecraft/spider/downloader/Downloader.java @@ -0,0 +1,15 @@ +package us.codecraft.spider.downloader; + +import us.codecraft.spider.Page; +import us.codecraft.spider.Request; +import us.codecraft.spider.Site; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午12:14 + */ +public interface Downloader { + + public Page download(Request request,Site site); +} diff --git a/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java b/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java new file mode 100644 index 00000000..61676654 --- /dev/null +++ b/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java @@ -0,0 +1,49 @@ +package us.codecraft.spider.downloader; + +import org.apache.commons.io.IOUtils; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.log4j.Logger; +import us.codecraft.spider.Page; +import us.codecraft.spider.Request; +import us.codecraft.spider.Site; +import us.codecraft.spider.selector.Html; +import us.codecraft.spider.selector.PlainText; +import us.codecraft.spider.utils.UrlUtils; + + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午12:15 + */ 
+public class HttpClientDownloader implements Downloader { + + private Logger logger = Logger.getLogger(getClass()); + + @Override + public Page download(Request request, Site site) { + logger.info("downloading page " + request.getUrl()); + HttpClient httpClient = HttpClientPool.getInstance().getClient(site); + try { + HttpGet httpGet = new HttpGet(request.getUrl()); + HttpResponse httpResponse = httpClient.execute(httpGet); + int statusCode = httpResponse.getStatusLine().getStatusCode(); + if (site.getAcceptStatCode().contains(statusCode)) { + String content = IOUtils.toString(httpResponse.getEntity().getContent(), + site.getEncoding() == null ? site.getEncoding() : httpResponse.getEntity().getContentType().getValue()); + Page page = new Page(); + page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + return page; + } else { + logger.warn("code error " + statusCode); + } + } catch (Exception e) { + logger.warn("download page " + request.getUrl() + " error", e); + } + return null; + } +} diff --git a/src/main/java/us/codecraft/spider/downloader/HttpClientPool.java b/src/main/java/us/codecraft/spider/downloader/HttpClientPool.java new file mode 100644 index 00000000..90696a62 --- /dev/null +++ b/src/main/java/us/codecraft/spider/downloader/HttpClientPool.java @@ -0,0 +1,64 @@ +package us.codecraft.spider.downloader; + +import org.apache.http.HttpVersion; +import org.apache.http.client.HttpClient; +import org.apache.http.client.params.ClientPNames; +import org.apache.http.client.params.CookiePolicy; +import org.apache.http.conn.scheme.PlainSocketFactory; +import org.apache.http.conn.scheme.Scheme; +import org.apache.http.conn.scheme.SchemeRegistry; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.impl.conn.PoolingClientConnectionManager; +import org.apache.http.params.*; +import us.codecraft.spider.Site; + +/** + * User: cairne + * 
Date: 13-4-21 + * Time: 下午12:29 + */ +public class HttpClientPool { + + public static final HttpClientPool INSTANCE = new HttpClientPool(5); + + public static HttpClientPool getInstance() { + return INSTANCE; + } + + private int poolSize; + + private HttpClientPool(int poolSize) { + this.poolSize = poolSize; + } + + public HttpClient getClient(Site site) { + return generateClient(site); + } + + private HttpClient generateClient(Site site) { + HttpParams params = new BasicHttpParams(); + params.setParameter(CoreProtocolPNames.USER_AGENT, site.getUserAgent()); + params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 1000); + params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 2000); + + HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); + paramsBean.setVersion(HttpVersion.HTTP_1_1); + paramsBean.setContentCharset("UTF-8"); + paramsBean.setUseExpectContinue(false); + + SchemeRegistry schemeRegistry = new SchemeRegistry(); + schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); + + PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager(schemeRegistry); + connectionManager.setMaxTotal(100); + connectionManager.setDefaultMaxPerRoute(100); + HttpClient httpClient = new DefaultHttpClient(connectionManager, params); + httpClient.getParams().setIntParameter("http.socket.timeout", 60000); + httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); + return httpClient; + } + + public void pushBack(HttpClient httpClient) { + + } +} diff --git a/src/main/java/us/codecraft/spider/pipeline/ConsolePipeline.java b/src/main/java/us/codecraft/spider/pipeline/ConsolePipeline.java new file mode 100644 index 00000000..6aa8f09c --- /dev/null +++ b/src/main/java/us/codecraft/spider/pipeline/ConsolePipeline.java @@ -0,0 +1,23 @@ +package us.codecraft.spider.pipeline; + +import us.codecraft.spider.Page; +import us.codecraft.spider.Site; +import 
us.codecraft.spider.selector.Selectable; + +import java.util.Map; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:45 + */ +public class ConsolePipeline implements Pipeline{ + + @Override + public void process(Page page,Site site) { + System.out.println("get page: "+page.getUrl()); + for (Map.Entry entry : page.getFields().entrySet()) { + System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings()); + } + } +} diff --git a/src/main/java/us/codecraft/spider/pipeline/FilePipeline.java b/src/main/java/us/codecraft/spider/pipeline/FilePipeline.java new file mode 100644 index 00000000..564a9fae --- /dev/null +++ b/src/main/java/us/codecraft/spider/pipeline/FilePipeline.java @@ -0,0 +1,55 @@ +package us.codecraft.spider.pipeline; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; +import us.codecraft.spider.Page; +import us.codecraft.spider.Site; +import us.codecraft.spider.selector.Selectable; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Map; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午6:28 + */ +public class FilePipeline implements Pipeline { + + private String path = "/data/temp/spider/"; + + public FilePipeline(){ + + } + + public FilePipeline(String path) { + this.path = path; + } + + @Override + public void process(Page page,Site site) { + String domain = site.getDomain(); + domain = StringUtils.removeStart(domain, "http://"); + domain = StringUtils.removeStart(domain, "https://"); + domain = StringUtils.replace(domain, "/", ""); + String path = this.path + "" + domain + "/"; + File file = new File(path); + if (!file.exists()) { + file.mkdir(); + } + try { + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); + printWriter.println("url:\t" + page.getUrl()); + for (Map.Entry entry : page.getFields().entrySet()) { + 
printWriter.println(entry.getKey() + ":\t" + entry.getValue().toStrings()); + } + printWriter.close(); + } catch (IOException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + + } +} diff --git a/src/main/java/us/codecraft/spider/pipeline/Pipeline.java b/src/main/java/us/codecraft/spider/pipeline/Pipeline.java new file mode 100644 index 00000000..549c70df --- /dev/null +++ b/src/main/java/us/codecraft/spider/pipeline/Pipeline.java @@ -0,0 +1,14 @@ +package us.codecraft.spider.pipeline; + +import us.codecraft.spider.Page; +import us.codecraft.spider.Site; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:39 + */ +public interface Pipeline { + + public void process(Page page,Site site); +} diff --git a/src/main/java/us/codecraft/spider/processor/PageProcessor.java b/src/main/java/us/codecraft/spider/processor/PageProcessor.java new file mode 100644 index 00000000..b2617a91 --- /dev/null +++ b/src/main/java/us/codecraft/spider/processor/PageProcessor.java @@ -0,0 +1,24 @@ +package us.codecraft.spider.processor; + +import us.codecraft.spider.Page; +import us.codecraft.spider.Site; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午11:42 + */ +public interface PageProcessor { + + /** + * extends the class to implements variaty spiders + * @param page + */ + public void process(Page page); + + /** + * the site the processor for + * @return + */ + public Site getSite(); +} diff --git a/src/main/java/us/codecraft/spider/processor/SimplePageProcessor.java b/src/main/java/us/codecraft/spider/processor/SimplePageProcessor.java new file mode 100644 index 00000000..197ca878 --- /dev/null +++ b/src/main/java/us/codecraft/spider/processor/SimplePageProcessor.java @@ -0,0 +1,41 @@ +package us.codecraft.spider.processor; + +import us.codecraft.spider.Page; +import us.codecraft.spider.Site; +import us.codecraft.spider.utils.UrlUtils; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-22 + * Time: 下午9:15 
+ */ +public class SimplePageProcessor implements PageProcessor { + + private String urlPattern; + + private static final String UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"; + + private Site site; + + public SimplePageProcessor(String startUrl, String urlPattern) { + this.site = Site.me().setStartUrl(startUrl). + setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); + this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; + + } + + @Override + public void process(Page page) { + List requests = page.getHtml().as().rs(urlPattern).toStrings(); + page.addTargetRequests(requests); + page.putField("title", page.getHtml().x("//title")); + page.putField("content", page.getHtml().sc()); + } + + @Override + public Site getSite() { + return site; + } +} diff --git a/src/main/java/us/codecraft/spider/schedular/FileCacheQueueSchedular.java b/src/main/java/us/codecraft/spider/schedular/FileCacheQueueSchedular.java new file mode 100644 index 00000000..3f3cf3c5 --- /dev/null +++ b/src/main/java/us/codecraft/spider/schedular/FileCacheQueueSchedular.java @@ -0,0 +1,142 @@ +package us.codecraft.spider.schedular; + +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.log4j.Logger; +import us.codecraft.spider.Site; +import us.codecraft.spider.Request; + +import java.io.*; +import java.util.LinkedHashSet; +import java.util.Set; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:13 + */ +public class FileCacheQueueSchedular implements Schedular { + + private Logger logger = Logger.getLogger(getClass()); + + private String filePath = System.getProperty("java.io.tmpdir"); + + private 
String fileUrlAllName = ".urls.txt"; + + private Site site; + + private String fileCursor = ".cursor.txt"; + + private PrintWriter fileUrlWriter; + + private PrintWriter fileCursorWriter; + + private AtomicInteger cursor = new AtomicInteger(); + + private AtomicBoolean inited = new AtomicBoolean(false); + + private BlockingQueue queue; + + private Set urls; + + public FileCacheQueueSchedular(Site site) { + this.site = site; + } + + public FileCacheQueueSchedular(Site site, String filePath) { + this.filePath = filePath; + this.site = site; + } + + private void flush() { + fileUrlWriter.flush(); + fileCursorWriter.flush(); + } + + private void init() { + readFile(); + initWriter(); + initFlushThread(); + inited.set(true); + logger.info("init cache schedular success"); + } + + private void initFlushThread() { + Executors.newScheduledThreadPool(1).scheduleAtFixedRate(new Runnable() { + @Override + public void run() { + flush(); + } + }, 10, 10, TimeUnit.SECONDS); + } + + private void initWriter() { + try { + fileUrlWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileUrlAllName, true)); + fileCursorWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileCursor, false)); + } catch (IOException e) { + throw new RuntimeException("init cache schedular error", e); + } + } + + private void readFile() { + try { + queue = new LinkedBlockingQueue(); + urls = new LinkedHashSet(); + readCursorFile(); + readUrlFile(); + } catch (IOException e) { + } + } + + private void readUrlFile() throws IOException { + String line; + BufferedReader fileUrlReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileUrlAllName)); + int lineReaded = 0; + while ((line = fileUrlReader.readLine()) != null) { + urls.add(line.trim()); + lineReaded++; + if (lineReaded > cursor.get()) { + queue.add(new Request(line, site)); + } + } + } + + private void readCursorFile() throws IOException { + BufferedReader fileCursorReader = new BufferedReader(new 
FileReader(filePath + site.getDomain() + fileCursor)); + String line = null; + //read the last number + while ((line = fileCursorReader.readLine()) != null) { + cursor = new AtomicInteger(NumberUtils.toInt(line)); + } + } + + @Override + public synchronized void push(Request request,Site site) { + if (!inited.get()) { + init(); + } + if (logger.isDebugEnabled()) { + logger.debug("push to queue " + request.getUrl()); + } + if (urls.add(request.getUrl())) { + queue.add(request); + fileUrlWriter.println(request.getUrl()); + } + + } + + @Override + public synchronized Request poll(Site site) { + if (!inited.get()) { + init(); + } + fileCursorWriter.println(cursor.incrementAndGet()); + return queue.poll(); + } +} diff --git a/src/main/java/us/codecraft/spider/schedular/QueueSchedular.java b/src/main/java/us/codecraft/spider/schedular/QueueSchedular.java new file mode 100644 index 00000000..81829633 --- /dev/null +++ b/src/main/java/us/codecraft/spider/schedular/QueueSchedular.java @@ -0,0 +1,40 @@ +package us.codecraft.spider.schedular; + +import org.apache.log4j.Logger; +import us.codecraft.spider.Request; +import us.codecraft.spider.Site; + +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:13 + */ +public class QueueSchedular implements Schedular { + + private Logger logger = Logger.getLogger(getClass()); + + private BlockingQueue queue = new LinkedBlockingQueue(); + + private Set urls = new HashSet(); + + @Override + public synchronized void push(Request request,Site site) { + if (logger.isDebugEnabled()){ + logger.debug("push to queue "+request.getUrl()); + } + if (urls.add(request.getUrl())){ + queue.add(request); + } + + } + + @Override + public synchronized Request poll(Site site) { + return queue.poll(); + } +} diff --git a/src/main/java/us/codecraft/spider/schedular/Schedular.java 
b/src/main/java/us/codecraft/spider/schedular/Schedular.java new file mode 100644 index 00000000..246afb22 --- /dev/null +++ b/src/main/java/us/codecraft/spider/schedular/Schedular.java @@ -0,0 +1,17 @@ +package us.codecraft.spider.schedular; + +import us.codecraft.spider.Request; +import us.codecraft.spider.Site; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:12 + */ +public interface Schedular { + + public void push(Request request,Site site); + + public Request poll(Site site); + +} diff --git a/src/main/java/us/codecraft/spider/selector/Html.java b/src/main/java/us/codecraft/spider/selector/Html.java new file mode 100644 index 00000000..7bbb64a6 --- /dev/null +++ b/src/main/java/us/codecraft/spider/selector/Html.java @@ -0,0 +1,75 @@ +package us.codecraft.spider.selector; + +import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午7:54 + */ +public class Html extends PlainText { + + public Html(List strings) { + super(strings); + } + + public Html(String text) { + super(text); + } + + @Override + public Selectable x(String xpath) { + XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); + return select(xpathSelector,strings); + } + + @Override + protected Selectable select(Selector selector, List strings) { + List results = new ArrayList(); + for (String string : strings) { + String result = selector.select(string); + if (result!=null){ + results.add(result); + } + } + return new Html(results); + } + + @Override + protected Selectable selectList(Selector selector, List strings) { + List results = new ArrayList(); + for (String string : strings) { + List result = selector.selectList(string); + results.addAll(result); + } + return new Html(results); + } + + @Override + public Selectable sc() { + SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); + return 
select(smartContentSelector,strings); + } + + @Override + public Selectable a() { + XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); + return select(xpathSelector,strings); + } + + @Override + public Selectable as() { + XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); + return selectList(xpathSelector,strings); + } + + @Override + public Selectable xs(String xpath) { + XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); + return selectList(xpathSelector, strings); + } + +} diff --git a/src/main/java/us/codecraft/spider/selector/PlainText.java b/src/main/java/us/codecraft/spider/selector/PlainText.java new file mode 100644 index 00000000..055cbdac --- /dev/null +++ b/src/main/java/us/codecraft/spider/selector/PlainText.java @@ -0,0 +1,103 @@ +package us.codecraft.spider.selector; + +import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午7:54 + */ +public class PlainText implements Selectable { + + protected List strings; + + public PlainText(List strings) { + this.strings = strings; + } + + public PlainText(String text) { + List results = new ArrayList(); + results.add(text); + this.strings = results; + } + + @Override + public Selectable x(String xpath) { + throw new UnsupportedOperationException(); + } + + @Override + public Selectable xs(String xpath) { + throw new UnsupportedOperationException(); + } + + @Override + public Selectable sc() { + throw new UnsupportedOperationException(); + } + + @Override + public Selectable a() { + throw new UnsupportedOperationException(); + } + + @Override + public Selectable as() { + throw new UnsupportedOperationException(); + } + + @Override + public Selectable r(String regex) { + RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); + return select(regexSelector, 
/**
 * Immutable holder for the groups captured by one regex match.
 *
 * <p>Index 0 is the whole match and index {@code n} is capture group
 * {@code n}, mirroring {@code java.util.regex.Matcher#group(int)}.
 */
class RegexResult {

    /** Shared "no match" result; every {@link #get(int)} call on it returns null. */
    public static final RegexResult EMPTY_RESULT = new RegexResult();

    private final String[] groups;

    /** Creates a result that holds no groups. */
    public RegexResult() {
        this.groups = null;
    }

    /**
     * @param groups captured groups, whole match first; the array is stored as given
     */
    public RegexResult(String[] groups) {
        this.groups = groups;
    }

    /**
     * Returns one captured group.
     *
     * @param groupId group index, 0 for the whole match
     * @return the group text, or {@code null} when there was no match, the
     *         group did not participate, or {@code groupId} is outside the
     *         range of groups this result holds
     */
    public String get(int groupId) {
        // An index the pattern never defined is treated like "no match"
        // instead of surfacing as ArrayIndexOutOfBoundsException.
        if (groups == null || groupId < 0 || groupId >= groups.length) {
            return null;
        }
        return groups[groupId];
    }

}
b/src/main/java/us/codecraft/spider/selector/RegexSelector.java new file mode 100644 index 00000000..692c45ed --- /dev/null +++ b/src/main/java/us/codecraft/spider/selector/RegexSelector.java @@ -0,0 +1,82 @@ +package us.codecraft.spider.selector; + +import org.apache.commons.lang3.StringUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午7:09 + */ +public class RegexSelector implements Selector { + + private String regexStr; + + private Pattern regex; + + public RegexSelector(String regexStr) { + if (StringUtils.isBlank(regexStr)){ + throw new IllegalArgumentException("regex must not be empty"); + } + if (!StringUtils.contains(regexStr,"(")||!StringUtils.contains(regexStr,")")){ + throw new IllegalArgumentException("regex must have capture group 1"); + } + this.regexStr = regexStr; + try { + regex = Pattern.compile(regexStr,Pattern.DOTALL|Pattern.CASE_INSENSITIVE); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException("invalid regex", e); + } + } + + @Override + public String select(String text) { + return selectGroup(text).get(1); + } + + @Override + public List selectList(String text) { + List strings=new ArrayList(); + List results = selectGroupList(text); + for (RegexResult result : results) { + strings.add(result.get(1)); + } + return strings; + } + + public RegexResult selectGroup(String text) { + Matcher matcher = regex.matcher(text); + if (matcher.find()) { + String[] groups = new String[matcher.groupCount()+1]; + for (int i = 0; i < groups.length; i++) { + groups[i] = matcher.group(i); + } + return new RegexResult(groups); + } + return RegexResult.EMPTY_RESULT; + } + + public List selectGroupList(String text) { + Matcher matcher = regex.matcher(text); + List resultList = new ArrayList(); + while (matcher.find()) { + String[] groups = new 
String[matcher.groupCount()+1]; + for (int i = 0; i < groups.length; i++) { + groups[i] = matcher.group(i); + } + resultList.add(new RegexResult(groups)); + } + return resultList; + } + + @Override + public String toString() { + return regexStr; + } + +} diff --git a/src/main/java/us/codecraft/spider/selector/ReplaceSelector.java b/src/main/java/us/codecraft/spider/selector/ReplaceSelector.java new file mode 100644 index 00000000..ddf887e9 --- /dev/null +++ b/src/main/java/us/codecraft/spider/selector/ReplaceSelector.java @@ -0,0 +1,47 @@ +package us.codecraft.spider.selector; + +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午7:09 + */ +public class ReplaceSelector implements Selector { + + private String regexStr; + + private String replacement; + + private Pattern regex; + + public ReplaceSelector(String regexStr, String replacement) { + this.regexStr = regexStr; + this.replacement = replacement; + try { + regex = Pattern.compile(regexStr); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException("invalid regex", e); + } + } + + @Override + public String select(String text) { + Matcher matcher = regex.matcher(text); + return matcher.replaceAll(replacement); + } + + @Override + public List selectList(String text) { + throw new UnsupportedOperationException(); + } + + @Override + public String toString() { + return regexStr + "_" + replacement; + } + +} diff --git a/src/main/java/us/codecraft/spider/selector/Selectable.java b/src/main/java/us/codecraft/spider/selector/Selectable.java new file mode 100644 index 00000000..9f44c3c1 --- /dev/null +++ b/src/main/java/us/codecraft/spider/selector/Selectable.java @@ -0,0 +1,88 @@ +package us.codecraft.spider.selector; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-20 + * Time: 下午7:51 + */ +public interface Selectable { + + /** + * select with 
/**
 * Extracts text from a single input string.
 *
 * <p>An implementation may support only one of the two methods and throw
 * {@link UnsupportedOperationException} from the other.
 */
public interface Selector {

    /**
     * Extracts a single result.
     *
     * @param text the text to select from
     * @return the selected text; callers should be prepared for {@code null}
     *         when nothing is selected
     */
    public String select(String text);

    /**
     * Extracts every result.
     *
     * @param text the text to select from
     * @return the selected texts in order of appearance
     */
    public List<String> selectList(String text);

}
us.codecraft.spider.selector; + +import org.apache.commons.lang3.StringUtils; + +import java.lang.reflect.Constructor; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午7:56 + */ +public class SelectorFactory { + + private Map innerCache = new ConcurrentHashMap(); + + private static final SelectorFactory INSTATNCE = new SelectorFactory(); + + public static SelectorFactory getInstatnce() { + return INSTATNCE; + } + + public RegexSelector newRegexSelector(String regex) { + return newSelector(RegexSelector.class, regex); + } + + public ReplaceSelector newReplaceSelector(String regex, String replacement) { + return newSelector(ReplaceSelector.class, regex, replacement); + } + + public XpathSelector newXpathSelector(String xpath) { + return newSelector(XpathSelector.class, xpath); + } + + public SmartContentSelector newSmartContentSelector(){ + return newSelector(SmartContentSelector.class); + } + + public T newAndCacheSelector(Class clazz, String... param) { + String cacheKey = getCacheKey(RegexSelector.class, param); + if (innerCache.get(cacheKey) != null) { + return (T) innerCache.get(cacheKey); + } + T selector = newSelector(clazz, param); + if (selector != null) { + innerCache.put(cacheKey, selector); + } + return selector; + + } + + public T newSelector(Class clazz, String... 
param) { + try { + if (param.length == 0) { + Constructor constructor + = clazz.getConstructor(); + T selector = constructor.newInstance(); + return selector; + } else if (param.length == 1) { + Constructor constructor + = clazz.getConstructor(String.class); + T selector = constructor.newInstance(param[0]); + return selector; + } else if (param.length == 2) { + Constructor constructor + = clazz.getConstructor(String.class, String.class); + T selector = constructor.newInstance(param[0], param[1]); + return selector; + } else { + throw new UnsupportedOperationException(); + } + } catch (ReflectiveOperationException e) { + throw new IllegalArgumentException("init object error", e); + } + } + + private String getCacheKey(Class clazz, String... param) { + return clazz.toString() + "_" + StringUtils.join(param, "_"); + } + +} diff --git a/src/main/java/us/codecraft/spider/selector/SmartContentSelector.java b/src/main/java/us/codecraft/spider/selector/SmartContentSelector.java new file mode 100644 index 00000000..b87a0a3b --- /dev/null +++ b/src/main/java/us/codecraft/spider/selector/SmartContentSelector.java @@ -0,0 +1,101 @@ +package us.codecraft.spider.selector; + +import org.apache.log4j.Logger; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; + +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * 找到clear + * User: cairne + * Date: 13-4-21 + * Time: 下午4:42 + */ +public class SmartContentSelector implements Selector { + + private Logger logger = Logger.getLogger(getClass()); + + public SmartContentSelector() { + } + + @Override + public String select(String text) { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + if (tagNode == null) { + return null; + } + TagNode[] nodes = tagNode.getElementsByName("p", true); + TagNode[] pres = tagNode.getElementsByName("pre", true); + Map pDensityCountMap = new HashMap(); + countPdensity(nodes, pDensityCountMap); + countPdensity(pres, 
pDensityCountMap); + for (TagNode pre : pres) { + addCounter(pre, pDensityCountMap, 2); + } + List> sortList = new ArrayList>(); + if (pDensityCountMap.size() == 0) { + return null; + } + for (Map.Entry entry : pDensityCountMap.entrySet()) { +// if (logger.isDebugEnabled()) { +// logger.debug("p\t" + entry.getKey().getName() + "#" + entry.getKey().getAttributeByName("id") + +// "@" + entry.getKey().getAttributeByName("class") + ":" + entry.getValue()); +// } + sortList.add(entry); + } + + Collections.sort(sortList, new Comparator>() { + @Override + public int compare(Map.Entry o1, Map.Entry o2) { + Double d1 = o1.getValue(); + Double d2 = o2.getValue(); + return -d1.compareTo(d2); + } + }); + TagNode contentNode = sortList.get(0).getKey(); + if (logger.isDebugEnabled()) { + logger.debug("p\t" + contentNode.getName() + "#" + contentNode.getAttributeByName("id") + + "@" + contentNode.getAttributeByName("class")); + } + return htmlCleaner.getInnerHtml(contentNode); + } + + private void addCounter(TagNode node, Map countMap, double delta) { + Double counter = countMap.get(node); + if (counter == null) { + countMap.put(node, delta); + } else { + countMap.put(node, counter + delta); + } + } + + private static final double parentWeight = 0.7; + + private void countPdensity(TagNode[] nodes, Map pDensityCountMap) { + for (TagNode node : nodes) { + if (node == null) { + continue; + } + TagNode parent = node.getParent(); + double pDensity = 1; + while (parent != null) { + addCounter(parent, pDensityCountMap, pDensity); + parent = parent.getParent(); + pDensity = pDensity * parentWeight; + } + } + } + + private TagNode findLowestCommonParent(List tagNodes, int maxMargin, Map countMap) { + TagNode contentNode = tagNodes.get(0); + return contentNode; + } + + @Override + public List selectList(String text) { + throw new UnsupportedOperationException(); + } +} diff --git a/src/main/java/us/codecraft/spider/selector/XpathSelector.java 
b/src/main/java/us/codecraft/spider/selector/XpathSelector.java new file mode 100644 index 00000000..6d9a1099 --- /dev/null +++ b/src/main/java/us/codecraft/spider/selector/XpathSelector.java @@ -0,0 +1,69 @@ +package us.codecraft.spider.selector; + +import org.htmlcleaner.*; + +import java.util.ArrayList; +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午9:39 + */ +public class XpathSelector implements Selector { + + private String xpathStr; + + public XpathSelector(String xpathStr) { + this.xpathStr = xpathStr; + } + + @Override + public String select(String text) { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + if (tagNode == null) { + return null; + } + try { + Object[] objects = tagNode.evaluateXPath(xpathStr); + if (objects != null && objects.length >= 1) { + if (objects[0] instanceof TagNode) { + TagNode tagNode1 = (TagNode) objects[0]; + return htmlCleaner.getInnerHtml(tagNode1); + } else { + return objects[0].toString(); + } + } + } catch (XPatherException e) { + e.printStackTrace(); + } + return null; + } + + @Override + public List selectList(String text) { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + if (tagNode == null) { + return null; + } + List results = new ArrayList(); + try { + Object[] objects = tagNode.evaluateXPath(xpathStr); + if (objects != null && objects.length >= 1) { + for (int i = 0; i < objects.length; i++) { + if (objects[i] instanceof TagNode) { + TagNode tagNode1 = (TagNode) objects[i]; + results.add(htmlCleaner.getInnerHtml(tagNode1)); + } else { + results.add(objects[i].toString()); + } + } + } + } catch (XPatherException e) { + e.printStackTrace(); + } + return results; + } +} diff --git a/src/main/java/us/codecraft/spider/utils/UrlUtils.java b/src/main/java/us/codecraft/spider/utils/UrlUtils.java new file mode 100644 index 00000000..f2fb0363 --- /dev/null +++ 
b/src/main/java/us/codecraft/spider/utils/UrlUtils.java @@ -0,0 +1,94 @@ +package us.codecraft.spider.utils; + +import org.apache.commons.lang3.StringUtils; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:52 + */ +public class UrlUtils { + + private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/"); + + public static String fixRelativeUrl(String url, String refer) { + if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { + return url; + } + if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) { + return url; + } + if (StringUtils.startsWith(url, "/")) { + String host = getHost(refer); + return host + url; + } else if (!StringUtils.startsWith(url, ".")) { + refer = reversePath(refer, 1); + return refer + "/" + url; + } else { + Matcher matcher = relativePathPattern.matcher(url); + if (matcher.find()) { + int reverseDepth = matcher.group(1).length(); + refer = reversePath(refer, reverseDepth); + String substring = StringUtils.substring(url, matcher.end()); + return refer + "/" + substring; + } else { + refer = reversePath(refer, 1); + return refer + "/" + url; + } + } + } + + public static String reversePath(String url, int depth) { + int i = StringUtils.lastOrdinalIndexOf(url, "/", depth); + if (i < 10) { + url = getHost(url); + } else { + url = StringUtils.substring(url, 0, i); + } + return url; + } + + public static String getHost(String url) { + String host = url; + int i = StringUtils.ordinalIndexOf(url, "/", 3); + if (i > 0) { + host = StringUtils.substring(url, 0, i); + } + return host; + } + + private static Pattern patternForProtocal = Pattern.compile("[\\w]+://"); + + public static String removeProtocal(String url) { + return patternForProtocal.matcher(url).replaceAll(""); + } + + public static String getDomain(String url) { + String domain = removeProtocal(url); + int i = 
StringUtils.indexOf(domain, "/", 1); + if (i > 0) { + domain = StringUtils.substring(domain, 0, i); + } + return domain; + } + + private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"']*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); + + public static String fixAllRelativeHrefs(String html, String url) { + StringBuilder stringBuilder = new StringBuilder(); + Matcher matcher = patternForHref.matcher(html); + int lastEnd = 0; + while (matcher.find()) { + stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); + stringBuilder.append(matcher.group(1)); + stringBuilder.append("\"" + fixRelativeUrl(matcher.group(2), url) + "\""); + lastEnd = matcher.end(); + } + stringBuilder.append(StringUtils.substring(html, lastEnd)); + return stringBuilder.toString(); + } + +} diff --git a/src/main/resources/log4j.xml b/src/main/resources/log4j.xml new file mode 100644 index 00000000..a6630f81 --- /dev/null +++ b/src/main/resources/log4j.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/test/java/us/codecraft/spider/HtmlTest.java b/src/test/java/us/codecraft/spider/HtmlTest.java new file mode 100644 index 00000000..0612d81b --- /dev/null +++ b/src/test/java/us/codecraft/spider/HtmlTest.java @@ -0,0 +1,20 @@ +package us.codecraft.spider; + +import org.junit.Assert; +import org.junit.Test; +import us.codecraft.spider.selector.Html; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午8:42 + */ +public class HtmlTest { + + @Test + public void testRegexSelector() { + Html selectable = new Html("aaaaaaab"); + Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString())); + + } +} diff --git a/src/test/java/us/codecraft/spider/SpiderTest.java b/src/test/java/us/codecraft/spider/SpiderTest.java new file mode 100644 index 00000000..c23b6cea --- /dev/null +++ b/src/test/java/us/codecraft/spider/SpiderTest.java @@ -0,0 +1,132 @@ +package us.codecraft.spider; + +import org.junit.Ignore; 
+import org.junit.Test; +import us.codecraft.spider.pipeline.ConsolePipeline; +import us.codecraft.spider.pipeline.FilePipeline; +import us.codecraft.spider.processor.SimplePageProcessor; +import us.codecraft.spider.samples.DianpingBlogProcessor; +import us.codecraft.spider.samples.HuxiuProcessor; +import us.codecraft.spider.schedular.FileCacheQueueSchedular; + +/** + * User: cairne + * Date: 13-4-20 + * Time: 下午7:46 + */ +public class SpiderTest { + + + @Test + public void testSpider() throws InterruptedException { + Spider me = Spider.me().pipeline(new FilePipeline()).processor(new HuxiuProcessor()); + me.run(); + } + + @Test + public void testGlobalSpider(){ + SimplePageProcessor pageProcessor = new SimplePageProcessor("http://2012guang.diandian.com/", "http://2012guang.diandian.com/post/*"); + Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")). + processor(pageProcessor).thread().start(); + SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://lol.duowan.com/", "http://lol.duowan.com/*.html"); + Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/spider/cache/")). 
+ processor(pageProcessor2).run(); + + Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); + + } + + @Test + public void test(){ + System.out.println(System.getProperty("java.io.tmpdir")); + } + + + @Ignore + @Test + public void languageSchema() { + + + /** + * + * _hrefs = rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") + * title = r(""(.*)"") + * body = x("//dd[@class='w133']") + * + * site.domain = "sh.58.com" + * site.ua="" + * site.cookie="aa:bb" + * + */ + + /** + * + * + * if (page == r('') && refer(1) == 1) { + * + * type = _refer(1) + * content = _text.t().c() + * title = x("asd@asd").r("",1) + * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) + * + * body=body[r(_currentUrl).g(1)] + * tags[%] = (tags[%] + xs('')) . r('') + * + * _targetUrls.add('' + x('').r('')) + * _sourceUrls.add() + * _header.put("",""); + * _cookie.add("asdsadasdsa"); + * + * + * } + * + * _cookie.add(_cookie['']) + * + * if (page == r('') && refer(1) == 1) + * ( + * _targetUrl = '' + x('') & r('') + * _sourceUrl = '' + * ) + * + */ + + /** + * + * + * + * + * + * + * + * + * + * + */ + + /** + * + * if (model.url('') && model.refer(1) == 1) + * ( + * + * model.set(type, model.refer(1)) + * content = t(_html) > c() + * title = x(_html, 'asd@asd') > r('',1) + * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') + * tags[%] = tags + xs('') > r('') + * model.setTargetUrl(); + * + * _targetUrl = '' + x('') & r('') + * _sourceUrl = '' + * ) + * + * _cookie.add(_cookie['']) + * + * if (page == r('') && refer(1) == 1) + * ( + * _targetUrl = '' + x('') & r('') + * _sourceUrl = '' + * ) + * + */ + } +} diff --git a/src/test/java/us/codecraft/spider/samples/DiandianBlogProcessor.java b/src/test/java/us/codecraft/spider/samples/DiandianBlogProcessor.java new file mode 100644 index 00000000..c735cda3 --- /dev/null +++ 
b/src/test/java/us/codecraft/spider/samples/DiandianBlogProcessor.java @@ -0,0 +1,29 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class DiandianBlogProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/src/test/java/us/codecraft/spider/samples/DianpingBlogProcessor.java b/src/test/java/us/codecraft/spider/samples/DianpingBlogProcessor.java new file mode 100644 index 00000000..f041a32b --- /dev/null +++ b/src/test/java/us/codecraft/spider/samples/DianpingBlogProcessor.java @@ -0,0 +1,33 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class DianpingBlogProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + List requests = page.getHtml().rs("]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + requests = 
page.getHtml().rs("]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + if (page.getUrl().toString().contains("shop")){ + page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); + page.putField("content", page.getHtml().sc()); + } + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/src/test/java/us/codecraft/spider/samples/F58PageProcesser.java b/src/test/java/us/codecraft/spider/samples/F58PageProcesser.java new file mode 100644 index 00000000..a5ce6cb2 --- /dev/null +++ b/src/test/java/us/codecraft/spider/samples/F58PageProcesser.java @@ -0,0 +1,28 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class F58PageProcesser implements PageProcessor { + + @Override + public void process(Page page) { + List strings = page.getHtml().rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); + page.addTargetRequests(strings); + page.putField("title",page.getHtml().r("(.*)")); + page.putField("body",page.getHtml().x("//dd[@class='w133']")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("sh.58.com").setStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. 
+ } +} diff --git a/src/test/java/us/codecraft/spider/samples/HuxiuProcessor.java b/src/test/java/us/codecraft/spider/samples/HuxiuProcessor.java new file mode 100644 index 00000000..d7d1a6e9 --- /dev/null +++ b/src/test/java/us/codecraft/spider/samples/HuxiuProcessor.java @@ -0,0 +1,29 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class HuxiuProcessor implements PageProcessor { + @Override + public void process(Page page) { + //http://progressdaily.diandian.com/post/2013-01-24/40046867275 + List requests = page.getHtml().rs("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); + page.addTargetRequests(requests); + page.putField("title",page.getHtml().x("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("content",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.huxiu.com").setStartUrl("http://www.huxiu.com/"). 
+ setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/src/test/java/us/codecraft/spider/samples/NjuBBSProcessor.java b/src/test/java/us/codecraft/spider/samples/NjuBBSProcessor.java new file mode 100644 index 00000000..fa22eedc --- /dev/null +++ b/src/test/java/us/codecraft/spider/samples/NjuBBSProcessor.java @@ -0,0 +1,28 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午8:08 + */ +public class NjuBBSProcessor implements PageProcessor { + @Override + public void process(Page page) { + List requests = page.getHtml().rs("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); + page.addTargetRequests(requests); + page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("bbs.nju.edu.cn").setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). 
+ setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/src/test/java/us/codecraft/spider/samples/OschinaBlogPageProcesser.java b/src/test/java/us/codecraft/spider/samples/OschinaBlogPageProcesser.java new file mode 100644 index 00000000..97ced9b4 --- /dev/null +++ b/src/test/java/us/codecraft/spider/samples/OschinaBlogPageProcesser.java @@ -0,0 +1,30 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class OschinaBlogPageProcesser implements PageProcessor { + + @Override + public void process(Page page) { + List strings = page.getHtml().as().r("(http://my\\.oschina\\.net)").toStrings(); + page.addTargetRequests(strings); + page.putField("title", page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); + page.putField("content", page.getHtml().sc()); + page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). 
+ setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/src/test/java/us/codecraft/spider/samples/OschinaPageProcesser.java b/src/test/java/us/codecraft/spider/samples/OschinaPageProcesser.java new file mode 100644 index 00000000..0ebaab6f --- /dev/null +++ b/src/test/java/us/codecraft/spider/samples/OschinaPageProcesser.java @@ -0,0 +1,29 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class OschinaPageProcesser implements PageProcessor { + + @Override + public void process(Page page) { + List strings = page.getHtml().rs("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); + page.addTargetRequests(strings); + page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a")); + page.putField("content", page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("www.oschina.net").setStartUrl("http://www.oschina.net/"). 
+ setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/src/test/java/us/codecraft/spider/samples/SinaBlogProcesser.java b/src/test/java/us/codecraft/spider/samples/SinaBlogProcesser.java new file mode 100644 index 00000000..f1de3cca --- /dev/null +++ b/src/test/java/us/codecraft/spider/samples/SinaBlogProcesser.java @@ -0,0 +1,29 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class SinaBlogProcesser implements PageProcessor { + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().rs("]*href=[\"']{1}(http://blog\\.sina\\.com\\.cn/s/blog_.*?)[\"']{1}").toStrings()); + page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2")); + page.putField("body",page.getHtml().sc()); + //x("//dd[@class='w133']") + page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)")); + page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a")); + } + + @Override + public Site getSite() { + return Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/"). 
+ setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } +} diff --git a/src/test/java/us/codecraft/spider/samples/TianyaPageProcesser.java b/src/test/java/us/codecraft/spider/samples/TianyaPageProcesser.java new file mode 100644 index 00000000..e39abc8c --- /dev/null +++ b/src/test/java/us/codecraft/spider/samples/TianyaPageProcesser.java @@ -0,0 +1,28 @@ +package us.codecraft.spider.samples; + +import us.codecraft.spider.Site; +import us.codecraft.spider.Page; +import us.codecraft.spider.processor.PageProcessor; + +import java.util.List; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午1:48 + */ +public class TianyaPageProcesser implements PageProcessor { + + @Override + public void process(Page page) { + List strings = page.getHtml().rs("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); + page.addTargetRequests(strings); + page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b")); + page.putField("body",page.getHtml().sc()); + } + + @Override + public Site getSite() { + return Site.me().setDomain("http://bbs.tianya.cn/").setStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. 
+ } +} diff --git a/src/test/java/us/codecraft/spider/selector/HtmlCleanerTest.java b/src/test/java/us/codecraft/spider/selector/HtmlCleanerTest.java new file mode 100644 index 00000000..b3931ad3 --- /dev/null +++ b/src/test/java/us/codecraft/spider/selector/HtmlCleanerTest.java @@ -0,0 +1,29 @@ +package us.codecraft.spider.selector; + +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.junit.Test; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午10:35 + */ +public class HtmlCleanerTest { + + @Test + public void test() throws IOException { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + + CleanerProperties props = htmlCleaner.getProperties(); + + TagNode node = htmlCleaner.clean(new URL("http://www.huanqiu.com"),"UTF-8"); + System.out.println(node.getAllElementsList(true)); + System.out.println(node); + } +} diff --git a/src/test/java/us/codecraft/spider/selector/RegexSelectorTest.java b/src/test/java/us/codecraft/spider/selector/RegexSelectorTest.java new file mode 100644 index 00000000..a53b5a92 --- /dev/null +++ b/src/test/java/us/codecraft/spider/selector/RegexSelectorTest.java @@ -0,0 +1,23 @@ +package us.codecraft.spider.selector; + +import junit.framework.Assert; +import org.junit.Test; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午7:13 + */ +public class RegexSelectorTest { + + @Test + public void testInvalidRegex() { + String regex = "\\d+("; + try { + new RegexSelector(regex); + Assert.assertNotNull(regex); + } catch (Exception e) { + + } + } +} diff --git a/src/test/java/us/codecraft/spider/selector/SmartConentSelectorTest.java b/src/test/java/us/codecraft/spider/selector/SmartConentSelectorTest.java new file mode 100644 index 00000000..06c56b3e --- /dev/null +++ b/src/test/java/us/codecraft/spider/selector/SmartConentSelectorTest.java @@ -0,0 +1,3055 @@ +package 
us.codecraft.spider.selector; + +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.junit.Test; + +import java.io.IOException; +import java.net.URL; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午10:35 + */ +public class SmartConentSelectorTest { + + @Test + public void test() throws IOException { + String text ="\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " 全文Feed的终极解决方案 - 阮一峰的网络日志\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
阮一峰的网络日志 » 首页 » 档案\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "

分类

\n" + + " \n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "

全文Feed的终极解决方案

\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "

作者: 阮一峰

\n" + + "\n" + + "

日期: 2010年4月17日

\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + " \n" + + "

正如我们都知道的,全文Feed最有用。

\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "

但是,世界上的大部分Feed,都是摘要Feed,甚至是标题Feed。我们只好自己动手,制作全文Feed。

\n" + + "\n" + + "

传统的制作方法非常麻烦,需要针对不同的网站,编写不同的内容提取规则。要是有一个傻瓜型的\"全文Feed生成器\",把摘要Feed往里面一扔,全文Feed就自动生成了,那该多好。

\n" + + "\n" + + "

FiveFilters.org提供的生成器,大概最接近于这种要求。

\n" + + "\n" + + "

\n" + + "\n" + + "

举例来说,网易的社会新闻Feed(http://news.163.com/special/00011K6L/rss_sh.xml)是一个摘要Feed。

\n" + + "\n" + + "

\n" + + "\n" + + "

我们把这个网址,送进FiveFilters.org,点击\"Create Feed\"按钮,全文Feed就自动产生了!(查看效果

\n" + + "\n" + + "

但是,这个生成器并不是百用百灵,比如新浪的Feed(http://rss.sina.com.cn/news/society/focus15.xml)就无法抓取全文。

\n" + + "\n" + + "

好在今年3月份,它开源了。作者Keyvan Minoukadeh将所有代码都公开了,所以如果遇到不能生效的Feed,现在我们就可以修改源码了。因此理论上,几乎所有的摘要Feed都可以自动转成全文Feed了。

\n" + + "\n" + + "

源码存放在launchpad.net上,需要安装Bazaar的客户端才能下载。我为大家提供方便,把它们压缩成一个zip文件,点击下载(1.0版,217KB)。

\n" + + "\n" + + "

下载后,上传到支持PHP 5.2的虚拟主机上,就可以直接使用。使用的时候,需要将cache子目录设为可写(权限777)。在config-sample.php文件中,可以查看设置选项,修改默认值后,将文件名改为config.php,就会生效。(不修改亦可,config文件并不是必需的。)

\n" + + "\n" + + "

这个程序的核心是readability.php文件,它负责判断当前网页中,那一部分属于页面的主要内容,然后将其抓取出来。实现原理照搬了arc90的ReadAbility脚本。简单说,思路是这样的:1)检查页面中所有p元素的父容器;2)根据相关特征,为每一个父容器计算一个特征值;3)特征值最大的容器,就是放置主要内容的容器。

\n" + + "\n" + + "

具体实现请阅读代码,源码写得非常清晰,而且有详细的注释。如果遇到不能抓取全文的Feed,你就要自己修改readability.php,增加相应的规则。比如,在我提供下载的代码中,我就设置了新浪网的规则,新浪网的全文Feed就能自动生成了。

\n" + + "\n" + + "

这个程序使用的是AGPL许可证,这就是说你可以自由地使用、修改、发布这个程序,但是只要你向他人提供基于这个程序的服务,你就必须公开源码。

\n" + + "\n" + + "

作者Keyvan Minoukadeh允诺,只要使用者向他捐款200美元,就发布2.0版。如果你喜欢这个程序,建议向他捐款

\n" + + "\n" + + "

P.S.

\n" + + "\n" + + "

这几天,我还发现了一个非常优秀的开源相册软件ZenPhoto,也推荐使用。

\n" + + "\n" + + "

UPDATE(2010.6.3)

\n" + + "\n" + + "

Full TEXT RSS 1.5版下载(283KB)

\n" + + "\n" + + "

UPDATE(2010.11.10)

\n" + + "\n" + + "

Full TEXT RSS 2.1版下载(362KB)

\n" + + "\n" + + "

(完)

\n" + + " \n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "

文档信息

\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + "
\n" + + "
\n" + + "

相关文章

\n" + + "
    \n" + + "\n" + + "
  • 2012.12.21: Javascript异步编程的4种方法\n" + + "\n" + + "
    \n" + + " 你可能知道,Javascript语言的执行环境是\"单线程\"(single thread)。\n" + + "
    \n" + + "\n" + + "
  • \n" + + "\n" + + " \n" + + "
  • 2012.12.14: 奥巴马筹款网站的制作过程\n" + + "\n" + + "
    \n" + + " 1.\n" + + "\n" + + "Kyle Rush是一个网站工程师。\n" + + "
    \n" + + "\n" + + "
  • \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "

功能链接

\n" + + "
    \n" + + "
  • 前一篇:\"草原新城\"康巴什
  • \n" + + "
  • 后一篇:网络时代的音乐家生存指南
  • \n" + + "
  • 更多内容请访问:首页 » 档案 » \n" + + "IT技术 \n" + + "
  • \n" + + "\n" + + "
  • \n" + + "\n" + + "
    \n" + + "\n" + + "站内搜索:\n" + + "\n" + + "\n" + + "Web\n" + + "\n" + + "www.ruanyifeng.com\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "\n" + + "
  • \n" + + "
  • Feed订阅:
  • \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "

广告(购买广告位)

\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "

留言(23条)

\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " zp\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

能不能介绍些Movable Type的文章,我比较喜欢它的静态页面,国内关于它的资料好像还不多。特别是MT5出来后,多页面功能可能会让刚接触的人晕头转向。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " 火点\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

很好,谢谢作者,只是赶到花了大量的时间在新闻上似乎有点不利于思考。

\n" + + "\n" + + "

用一个图书管理软件(BLM)整理了大学期间看过的书,仅有180本左右,汗颜,这就是我的大学……

\n" + + "\n" + + "

现在参加工作了,好在业余时间还算充裕,希望可以多读一些书。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " AlbertDiao\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " 野草博客\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

嗯,野草一直在用他:)

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " Ruan YiFeng\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
引用AlbertDiao的发言:
\n" + + "\n" + + "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + + "\n" + + "
\n" + + "\n" + + "

流量会越来越便宜,真正昂贵的是你的时间。所以还是全文Feed好。

\n" + + "\n" + + "
\n" + + "
引用zp的发言:
\n" + + "\n" + + "

能不能介绍些Movable Type的文章。

\n" + + "\n" + + "
\n" + + "\n" + + "

我有这个打算,但是文章不太好写,还需要准备。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " luops\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

昨晚测试了此订阅
\n" + + " 同时我也保留了原订阅。
\n" + + "今天发现,同样订阅了163新闻的情况下
\n" + + "全文订阅比官方订阅少了很多新闻
\n" + + "不知其他童靴有没有这样子情况

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " 鲜为人志\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

呵呵~ 这样都可以啊~

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " roy_hu\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
引用AlbertDiao的发言:
\n" + + "\n" + + "

如果是手机RSS的话,摘要Feed比较好。一般浏览摘要,感兴趣的点进全文,这样比较节省流量。

\n" + + "\n" + + "
\n" + + "\n" + + "

我更喜欢全文博客,因为在手机上看Google Reader,自动都排好了版,而看全文的时候需要浏览器排版,没有Google Reader那样专门设计给手机的看着舒服。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " Jack\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

另外,也可以用YAHOO PIPE 和YQL来抓取全文。这样除了可以把非全文的FEED变成全文输出外,还可以处理根本没有FEED输出的网页。(不过有很多网页需要处理一下GB2312和UNICODE转换。).而且这样还有一个最大的好处,就是不用建立自己的服务器。

\n" + + "\n" + + "


\n" + + "下面两个FEED 就是用这种办法生成的。
\n" + + "http://feeds.feedburner.com/wenxuecity_news

\n" + + "\n" + + "

http://feeds.feedburner.com/boxun_headline

\n" + + "\n" + + "

可以用GOOGLE READER 来读取它们。也不失为一种间接翻越G/F/W 的办法。
\n" + + "

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " Ruan YiFeng\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
引用luops的发言:
\n" + + "\n" + + "

全文订阅比官方订阅少了很多新闻

\n" + + "\n" + + "
\n" + + "\n" + + "

全文Feed默认只有4个条目,下载代码后,你可以自己修改这个值。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " 坏坏鼠\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

不懂编程只会用GR的文科生飘过~~~
\n" + + "ps:阮老师的这篇文章GR里也只是显示标题,所以漂洋过海地过来了(牛博编辑的那个频道,已经将你的博客订阅了呵O(∩_∩)O)~~

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " 111\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
引用luops的发言:
\n" + + "\n" + + "

全文订阅比官方订阅少了很多新闻

\n" + + "\n" + + "
\n" + + "\n" + + "


\n" + + "是这样的,丢失了好多,时效性好差
\n" + + "

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " kuber\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

想请教一下你怎么修改规则来全文输出新浪网rss的, 我也碰到几个Feed,缺省的配置不能正确处理.
\n" + + "另外我建议设立一个地方大家可以交流一下脚本不能处理的feed,以及修改的方法, 这样各人不用重复浪费时间了.

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " 111\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

下载了lz的代码,发布到网站上,功能可用了。rss数量自己设置就好。

\n" + + "\n" + + "

杯具的是网站只有内网地址,gr不认生成的feed地址。

\n" + + "\n" + + "

只能CS订阅,不喜。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " lietlie\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

http://mrss.dokoda.jp/
\n" + + "虽然是小鬼子的网站,但是是我找到的能够全文Feed最好的在线工具了,和LZ推荐的网站相比,可以输出所有项目,而没有4条目的限制,当然也不必自己搭建服务器,日文内容很简单,如果使用的是FF或Chrome浏览器还可以利用Google的自动翻译功能将大致内容翻译为中文(FF利用Google Toolbar)——其实即使不翻译一样很容易使用。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " Ruan YiFeng\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
引用kuber的发言:
\n" + + "\n" + + "

想请教一下你怎么修改规则来全文输出新浪网rss的, 我也碰到几个Feed,缺省的配置不能正确处理.

\n" + + "\n" + + "
\n" + + "\n" + + "

新浪的内容容器,有一个比较怪的ID名。只要搜索这个字符串,就能提取内容了。

\n" + + "\n" + + "

最终,你还是需要读readability.php的代码,只要读懂了,我觉得任何页面都能提取。
\n" + + "

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " 诗沐\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

哇 源码写得相当清爽啊~注释习惯很棒

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " xangd\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

有人在appspot上部署了一个python的port
\n" + + "http://andrewtrusty.appspot.com/readability/
\n" + + "这个没有4篇post的限制

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " neotrue\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

很好用,谢谢!

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " harvey\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

博主,作者把1.5版本放出来了,
\n" + + "可否再麻烦你打包一下,我bazzar一直不成功

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " Ruan YiFeng\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
引用harvey的发言:
\n" + + "\n" + + "

博主,作者把1.5版本放出来了,
\n" + + "可否再麻烦你打包一下,我bazzar一直不成功

\n" + + "\n" + + "
\n" + + "\n" + + "

已经加上去了,:-)

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " 张治国\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

博主,全文Feed默认只有4个条目,下载代码后,修改哪段代码可以改变这个值啊,config-sample.PHP中的数值吗?我是新手,希望博主指点一下,谢谢。

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + " felix\n" + + "\n" + + " 说:\n" + + " \n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + "

看不懂readability,不知道博主能否提供一下过滤页面上的干扰字符的方法
\n" + + "

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + "
\n" + + "

我要发表看法

\n" + + "
\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "

\n" + + "

\n" + + "
\n" + + "
\n" + + "

\n" + + "

«-必填

\n" + + "
\n" + + "
\n" + + "

\n" + + "

«-必填,不公开

\n" + + "
\n" + + "
\n" + + "

\n" + + "

«-我信任你,不会填写广告链接

\n" + + "
\n" + + "
\n" + + "

\n" + + "

\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "

正在发表您的评论,请稍候

\n" + + "

\n" + + " \n" + + "\n" + + "

\n" + + "
\n" + + "\n" + + "

«- 点击按钮

\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "

联系方式 | ruanyifeng.com 2003 - 2012\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "

\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
分享按钮 \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + ""; + Html html = new Html(text); + Selectable sc = html.sc(); + System.out.println(sc); + } + + @Test + public void test2(){ + String text = "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " 地球上最后的夜晚 (豆瓣)\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + " 提醒\n" + + " \n" + + "
\n" + + "
\n" + + "

加载中...

\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "
\n" + + "
    \n" + + " \n" + + " \n" + + "
  • \n" + + " 豆瓣\n" + + "
  • \n" + + " \n" + + " \n" + + "
  • \n" + + " 读书\n" + + "
  • \n" + + " \n" + + " \n" + + "
  • \n" + + " 电影\n" + + "
  • \n" + + " \n" + + " \n" + + "
  • \n" + + " 音乐\n" + + "
  • \n" + + " \n" + + " \n" + + "
  • \n" + + " 同城\n" + + "
  • \n" + + " \n" + + " \n" + + "
  • \n" + + " 小组\n" + + "
  • \n" + + " \n" + + " \n" + + "
  • \n" + + " 阅读\n" + + "
  • \n" + + " \n" + + " \n" + + "
  • \n" + + " 豆瓣FM\n" + + "
  • \n" + + " \n" + + "
  • \n" + + " 更多\n" + + "
    \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
    九点
    阿尔法城
    移动应用
    \n" + + "
    \n" + + "
  • \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + " 豆瓣读书\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + " 搜索:\n" + + " \n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "

\n" + + " 地球上最后的夜晚\n" + + "
\n" + + "

\n" + + "\n" + + " \n" + + "
\n" + + " \n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \"地球上最后的夜晚\"\n" + + " \n" + + "\n" + + "
\n" + + "

\n" + + " 更新描述或封面\n" + + "

\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " 原作名: Last Evenings on Earth
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " 作者: \n" + + " \n" + + " [智利] 罗贝托·波拉尼奥\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " 译者: \n" + + " \n" + + " 赵德明\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " 出版社: 上海人民出版社
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " 出版年: 2013-4-1
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " 页数: 288
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " 定价: 45.00元
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " 丛书: 罗贝托·波拉尼奥作品系列
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " ISBN: 9787208112025
\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "

\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " 8.4\n" + + " \n" + + "\n" + + " \n" + + "

\n" + + "

\n" + + " (\n" + + " \n" + + " 11人评价\n" + + " \n" + + " )\n" + + "

\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + " 45.5%
\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + " 9.1%
\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + " 18.2%
\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + " 18.2%
\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + " 9.1%
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + "
\n" + + " \n" + + " 想读\n" + + " \n" + + " \n" + + " 在读\n" + + " \n" + + " \n" + + " 读过\n" + + " \n" + + "
\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " 评价: \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "
    \n" + + "
  • \n" + + "  写笔记\n" + + "
  • \n" + + "\n" + + "
  • \n" + + "  写书评\n" + + "
  • \n" + + "\n" + + "
  • \n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + " 加入购书单\n" + + " 已在购书单\n" + + "
    \n" + + "
  • \n" + + "\n" + + "
  • \n" + + " \n" + + "\n" + + "\n" + + "
    \n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + "\n" + + " 添加到豆列\n" + + "
    \n" + + "\n" + + "
  • \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "
  • \n" + + " 分享到   \n" + + "
  • \n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + " 推荐\n" + + " \n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " 内容简介\n" + + "  · · · · · ·\n" + + "\n" + + "

\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " 作者简介\n" + + "  · · · · · ·\n" + + "\n" + + "

\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "

罗贝托•波拉尼奥(Roberto Bolaño,1953—2003)出生于智利,父亲是卡车司机和业余拳击手,母亲在学校教授数学和统计学。1968年全家移居墨西哥。1973年波拉尼奥再次回到智利投身社会主义革命却遭到逮捕,差点被杀害。逃回墨西哥后他和好友推动了融合超现实主义、达达主义以及街头剧场的“现实以下主义”(Infrarealism)运动,意图激发拉丁美洲年轻人对生活与文学的热爱。1977年他前往欧洲,最后在西班牙波拉瓦海岸结婚定居。2003年因为肝脏功能损坏,等不到器官移植而在巴塞罗那去世,年仅五十岁。

波拉尼奥四十岁才开始写小说,作品数量却十分惊人,身后留下十部小说、四部短篇小说集以及三部诗集。1998年出版的《荒野侦探》在拉美文坛引起的轰动,不亚于三十年前《百年孤独》出版时的盛况。而其身后出版的《2666》更是引发欧美舆论压倒性好评,均致以...

(展开全部)

\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "

罗贝托•波拉尼奥(Roberto Bolaño,1953—2003)出生于智利,父亲是卡车司机和业余拳击手,母亲在学校教授数学和统计学。1968年全家移居墨西哥。1973年波拉尼奥再次回到智利投身社会主义革命却遭到逮捕,差点被杀害。逃回墨西哥后他和好友推动了融合超现实主义、达达主义以及街头剧场的“现实以下主义”(Infrarealism)运动,意图激发拉丁美洲年轻人对生活与文学的热爱。1977年他前往欧洲,最后在西班牙波拉瓦海岸结婚定居。2003年因为肝脏功能损坏,等不到器官移植而在巴塞罗那去世,年仅五十岁。

波拉尼奥四十岁才开始写小说,作品数量却十分惊人,身后留下十部小说、四部短篇小说集以及三部诗集。1998年出版的《荒野侦探》在拉美文坛引起的轰动,不亚于三十年前《百年孤独》出版时的盛况。而其身后出版的《2666》更是引发欧美舆论压倒性好评,均致以杰作、伟大、里程碑、天才等等赞誉。苏珊•桑塔格、约翰•班维尔、科尔姆•托宾、斯蒂芬•金等众多作家对波拉尼奥赞赏有加,更有评论认为此书的出版自此将作者带至塞万提斯,斯特恩,梅尔维尔,普鲁斯特,穆齐尔与品钦的同一队列。

\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " 目录\n" + + "  · · · · · ·\n" + + "\n" + + "

\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " 圣西尼……………………………………3
\n" + + " 亨利·西蒙·勒普兰斯…………………… 27
\n" + + " 恩里克·马丁……………………………39
\n" + + " 一件文学奇事…………………… ……59
\n" + + " 通话…………………… ………………75
\n" + + " 毛毛虫…………………………………83
\n" + + " · · · · · ·\n" + + " (更多)\n" + + "
\n" + + "\n" + + "
\n" + + " 圣西尼……………………………………3
\n" + + " 亨利·西蒙·勒普兰斯…………………… 27
\n" + + " 恩里克·马丁……………………………39
\n" + + " 一件文学奇事…………………… ……59
\n" + + " 通话…………………… ………………75
\n" + + " 毛毛虫…………………………………83
\n" + + " 安妮·穆尔的生平 ……………………101
\n" + + " “小眼”席尔瓦 ………………………139
\n" + + " 戈麦斯帕拉西奥 ……………………159
\n" + + " 地球上最后的夜晚………………… 173
\n" + + " 1978 年的几天………………………205
\n" + + " 在法国和比利时闲逛…………………225
\n" + + " 牙科医生…………………… ………245
\n" + + " 邀舞卡……………………………… 273
\n" + + " · · · · · · (收起)\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " "地球上最后的夜晚"试读\n" + + "  · · · · · ·\n" + + "\n" + + "

\n" + + "\n" + + "\n" + + "
\n" + + "

情况是这样的:B 和B 父去阿卡普尔科度假。一大早,清晨六点,父子俩就要出发。那天夜里,B 睡在父亲家里。没梦,或者就算有梦,一睁眼也忘了。听见父亲在卫生间。向窗外望去,一片漆黑。B 不开灯,穿衣裳。等走出卧室的时候,父亲已经在桌旁看前一天的体育报纸了。早饭已经做好了。咖啡,牧场煎蛋。B 问候父亲后,走进卫生间。\n" + + "B 父的汽车是1970 年的福特野马。六点半,父子俩上车,开..

\n" + + "\n" + + "
· · · · · · (查看全部试读)
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " 豆瓣成员常用的标签(共38个)\n" + + "  · · · · · ·\n" + + "\n" + + "

\n" + + "\n" + + "\n" + + "
罗贝托-波拉尼奥(68)   拉美文学(35)   外国文学(24)   小说(22)   智利文学(14)   波拉尼奥(10)   智利(10)   小说集(10)  
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "

丛书信息

\n" + + "
\n" + + "  罗贝托·波拉尼奥作品系列 (共6册),\n" + + "这套丛书还有\n" + + "《2666》,《荒野侦探》,《2666》,《荒野侦探》,《护身符》。
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "

\n" + + " 书评  · · · · · · \n" + + "

\n" + + " \n" + + " 我来评论这本书\n" + + "\n" + + "\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \"DeadKennedy\"/\n" + + "\n" + + "
\n" + + "

\n" + + "
\n" + + " \">\"\n" + + " \n" + + " \"<\"\n" + + "
\n" + + " 信仰的挽歌\n" + + "

\n" + + "
\n" + + " \n" + + " DeadKennedy   \n" + + " \n" + + " \n" + + "

\n" + + "
\n" + + " Elegy to Faith\n" + + "\n" + + "\n" + + "波拉诺难得的短篇集。\n" + + "\n" + + "\n" + + "比之长篇,波拉诺的短篇是其能力的代表。他的长篇像话剧台词,冗长,精彩,让人迷失其中,在读过大概三百页之后似乎明白一些他在说什么。而他的短篇则像电台DJ的串词,明了,信息丰富,基本是波拉诺的自传和自白。很多篇目就是作家自身经历的镜像。是一些关于动荡,个人自由,劳动,知识份子,流放和坚持的故事。纽约时报评论这本书为“流放民谣”。\n" + + "\n" + + "\n" + + "比如写自身经历的:......\n" + + "\n" + + "

\n" + + " \n" + + " 2012-02-14 13:53    \n" + + " 2/2有用\n" + + " \n" + + " \n" + + " 来自 New Directions2007版\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \"DeadKennedy\"/\n" + + "\n" + + "
\n" + + "

\n" + + "
\n" + + " \">\"\n" + + " \n" + + " \"<\"\n" + + "
\n" + + " 信仰的挽歌\n" + + "

\n" + + "
\n" + + " \n" + + " DeadKennedy   \n" + + " \n" + + " \n" + + "

\n" + + "
\n" + + " Elegy to Faith\n" + + "\n" + + "\n" + + "波拉诺难得的短篇集。\n" + + "\n" + + "\n" + + "比之长篇,波拉诺的短篇是其能力的代表。他的长篇像话剧台词,冗长,精彩,让人迷失其中,在读过大概三百页之后似乎明白一些他在说什么。而他的短篇则像电台DJ的串词,明了,信息丰富,基本是波拉诺的自传和自白。很多篇目就是作家自身经历的镜像。是一些关于动荡,个人自由,劳动,知识份子,流放和坚持的故事。纽约时报评论这本书为“流放民谣”。\n" + + "\n" + + "\n" + + "比如写自身经历的:......\n" + + "\n" + + "

\n" + + " \n" + + " 2012-02-14 13:53    \n" + + " 2/2有用\n" + + " \n" + + " \n" + + " 来自 New Directions2007版\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " 论坛\n" + + "  · · · · · ·\n" + + "\n" + + "

\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
平装or精装?孔亚雷or赵德明?来自Nihilum5 回应2013-04-21
書到底出了沒啊?来自阿城199114 回应2013-04-13
不是翻译问题,是根本看不懂来自呆呆双鱼女1 回应2013-04-20
\n" + + "\n" + + "\n" + + "

>\n" + + " 在这本书的论坛里发言\n" + + "

\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "

\n" + + " 在哪儿买这本书?\n" + + "

\n" + + " \n" + + "
\n" + + " \n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " 加入购书单\n" + + " \n" + + " 已在购书单 \n" + + " 查看\n" + + " 删除\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " 多本比价,批量购买\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " 这本书的其他版本 \n" + + "  · · · · · ·\n" + + "  (\n" + + " 全部3\n" + + " ) \n" + + "\n" + + "

\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " 以下豆列推荐\n" + + "  · · · · · ·\n" + + "  (\n" + + " 全部\n" + + " ) \n" + + "\n" + + "

\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "

谁读这本书?

\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + " \n" + + "
\"小K\"
\n" + + "
小K
\n" + + "
13分钟前 想读
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "

\n" + + "
\n" + + " \n" + + "
\"杰森辛普森\"
\n" + + "
杰森辛普森
\n" + + "
28分钟前 想读
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "

\n" + + "
\n" + + " \n" + + "
\"Aby\"
\n" + + "
Aby
\n" + + "
37分钟前 想读
\n" + + "\n" + + "
\n" + + "\n" + + " tags:对人生的诠释\n" + + "\n" + + "
\n" + + "

\n" + + "
\n" + + " \n" + + "
\"老男孩\"
\n" + + "
老男孩
\n" + + "
1小时前 想读
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "

\n" + + "
\n" + + "\n" + + "\n" + + "

> 5人在读

\n" + + "

> 12人读过

\n" + + "

> 658人想读

\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + "

\n" + + "\n" + + " 喜欢这本书的人常去的小组\n" + + "  · · · · · ·\n" + + "\n" + + "

\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "
\"托马斯·品钦\"/
\n" + + " \n" + + "
托马斯·品钦 (711)\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\"短经典\"/
\n" + + " \n" + + "
短经典 (787)\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\"寻找:布鲁诺.舒尔茨\"/
\n" + + " \n" + + "
寻找:布鲁诺.舒尔茨 (466)\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\"胡安·鲁尔福\"/
\n" + + " \n" + + "
胡安·鲁尔福 (613)\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\"V.S.奈保尔\"/
\n" + + " \n" + + "
V.S.奈保尔 (445)\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\"胡利奥·科塔萨尔\"/
\n" + + " \n" + + "
胡利奥·科塔萨尔 (1053)\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\"中国当代书籍装帧摭评\"/
\n" + + " \n" + + "
中国当代书籍装帧摭评 (1373)\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\"泼先生\"/
\n" + + " \n" + + "
泼先生 (485)\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

> 收藏这本书的1个小组

\n" + + "
\n" + + "

> \n" + + " 加到我的小组收藏里 \n" + + "

\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "

二手市场

\n" + + "
\n" + + "
    \n" + + "
  • \n" + + " > 点这儿转让\n" + + "\n" + + " 有658人想读,手里有一本闲着?\n" + + "
  • \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "

订阅关于地球上最后的夜晚的评论:
\n" + + " feed: rss 2.0

\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + " © 2005-2013 douban.com, all rights reserved\n" + + "\n" + + "\n" + + "\n" + + " 关于豆瓣\n" + + " · 在豆瓣工作\n" + + " · 联系我们\n" + + " · 免责声明\n" + + " \n" + + " · 帮助中心\n" + + " · 开发者\n" + + " · 图书馆合作\n" + + " · 手机读书\n" + + " · 豆瓣广告\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n"; + + Html html = new Html(text); + System.out.println(html.sc()); + + } +} diff --git a/src/test/java/us/codecraft/spider/selector/XpathSelectorTest.java b/src/test/java/us/codecraft/spider/selector/XpathSelectorTest.java new file mode 100644 index 00000000..24988f7a --- /dev/null +++ b/src/test/java/us/codecraft/spider/selector/XpathSelectorTest.java @@ -0,0 +1,2750 @@ +package us.codecraft.spider.selector; + +import org.junit.Assert; +import org.junit.Test; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 上午10:06 + */ +public class XpathSelectorTest { + + String huxiuHtml = "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "产品情感化设计的两个层面-观点-@虎嗅网\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t\t\t\n" + + "\t\n" + + "
\n" + + "

\"虎嗅网\"

\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "订阅虎嗅\n" + + "RSS\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + "

产品情感化设计的两个层面

\n" + + "\n" + + "
\n" + + "
\n" + + " 2013-4-22 16:10\n" + + " \n" + + " \t评论(0)\n" + + " \n" + + "产品\n" + + "投稿\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
\n" + + " \"\"\n" + + " 用户之所以选择一款产品,首要的一点在于产品的功能或内容满足了用户。而随着产品的发展,同类型的产品基础功能都大致相同,产品之间的竞争越来越难在功能层面拉开差距。现在产品人员也更加开始在用户体验上下功夫了,而对用户体验的不断追求也就上升到了情感层面。

谈起产品情感化设计,可以拿手机通讯录中添加联系人头像来举例子,单就这个功能点而言,最基础的只要用户能够添加联系人的头像即可,而如果在这个功能上添加用户情感化的元素后,就可以在用户的头像展示上给予更大空间,让用户能够更大的发挥自己的个性。我们也发现新浪微博和开心网个人主页的设计也都增加了个人封面的展示。产品情感化对于功能本身是没有影响的,而情感因素后,产品对用户还会更有吸引力。短期来看,个性化和给用户更大的发挥空间是产品情感化设计的两个很重要的方向。

产品的情感化设计有两个不同的做法:一个是在已有功能上进行扩展,如上文所提到的通讯录中上传头像的功能,是对用户表达欲的满足,用户情感的单向表达;另一种做法则是做一个完全情感化的产品,用户情感的双向表达,是用户之间情感内容的交流,产品扮演的只是桥梁作用,例如小恩爱、抬杠这样的产品。其实所有涉及到用户互动性的产品对于情感化的拓展空间都很大,但是与普通社交不同的是,产品的情感化在于人与人之间更深层次的交流。在我个人看来,社交网站中的发状态功能已经仅仅是用户表达的工具,极少含有感情因素,但是像Facebook推出的暗恋功能却是一个情感化产品,产品的情感化不仅在于让用户将自己的情感寄予到产品中,而且产品要想具有情感化很重要的一点在于产品本身能够起到挖掘用户情感的作用。

前面所提到的两种做法区别在于,前者是基于已有需求而进行的情感化设计,而后者则是完全情感化的产品,就成功率来讲,显然是前者更大一些。本身有需求的产品对于产品的情感化发展不仅奠定了基础,而且也烘托了氛围,做好了铺垫。如果是做一个完全情感化的产品,失败的可能性很大。当产品的功能满足了用户的情感表达,那就意味着产品可以满足用户的需求,而当产品本身所扮演的角色无法成为用户的寄托,那么产品就会面临失败。可想而知,情感化的产品肯定属于UGC类型,对于用户内容的质量要求会比较高,当技术水平不够高、功能操作不够便捷的时候,自然就提高了使用门槛。而且这种类型的产品对于氛围的烘托本身就会有相对高的要求。

如果单从功能角度去衡量,用户情感的单向表达属于功能层面,而用户情感的双向表达属于内容层面。除此之外,产品情感化还有文案和产品风格上的表现。

你是一个资深网虫,或许你也有所感觉,现在的网站文案已经越来越有人情味了。例如提示文案不是“你的账号密码错误”而是“密码不对哦”,文案中增加了语气词。这只是其中的一种表达方式,除此之外,你会看到产品设计中的很多引导方式也更有趣味性,文案内容的情感化也会增加用户的接受程度。

最近自己在使用产品中也有个很大的感触,就是产品风格对用户的吸引,同样是天气类应用,功能上相差无几,但是不同的风格却可以吸引不同的受众。有的是大众普通的风格,有的是小清新风格,有的是卡通风格等等,可以理解为用户对不同风格产品的选择背后的原因就是用户个人情感的不同,而用户的这种情感不能改变只能顺从。

更深层次的讲,产品情感化的关键在于产品功能与用户情感的承接,满足人们情感的诉求。从心理学上讲人的本性有很多,例如表达欲、攀比心理,但从人的本性和产品的情感化进行匹配,会有太多的点,在这里就不一一例举了,大家可以在产品的使用过程中逐渐感受。而之所以要选择利用人性情感的哪一点来设计产品就要根据具体的产品目标来衡量了。

文章来源:马虎眼    作者微信账号:mahuyan


本文由\n" + + "云瑞\n" + + "授权虎嗅网发表,并经虎嗅网编辑。转载此文章须经作者同意,并请附上出处(虎嗅网)及本页链接。
原文链接http://www.huxiu.com/article/13380/1.html\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + " 分享(0):\n" + + "
\n" + + "
\n" + + "
    \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + "
  • \n" + + " \n" + + "
  • \n" + + "
\n" + + " \n" + + "
\n" + + "
收藏\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " 没劲 \n" + + " 喜欢 \n" + + "
\t\t \n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "

参与讨论,请先登录|注册

\n" + + "

\n" + + "\n" + + "\n" + + "

\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\t\t\t
\n" + + "

作者:云瑞

\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
个人签名
\n" + + "
人人都爱互联网
\n" + + "
\n" + + "
\n" + + "\t\t\t\t\n" + + "

作者其他文章

\n" + + "\n" + + " \n" + + "更多文章\n" + + "
\n" + + "\n" + + "
\n" + + "

您不能错过的作者

\n" + + "
  • \n" + + "

    \"葛甲\"

    \n" + + "

    葛甲

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"吴澍\"

    \n" + + "

    吴澍

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"知乎精选\"

    \n" + + "

    知乎精选

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"译言\"

    \n" + + "

    译言

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"潘乱\"

    \n" + + "

    潘乱

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"王云辉\"

    \n" + + "

    王云辉

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"阑夕\"

    \n" + + "

    阑夕

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"胡晓东\"

    \n" + + "

    胡晓东

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"阳淼\"

    \n" + + "

    阳淼

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"魏武挥\"

    \n" + + "

    魏武挥

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"高低买个皮夹克\"

    \n" + + "

    高低买个皮夹克

    \n" + + "
  • \n" + + "
  • \n" + + "

    \"潘越飞\"

    \n" + + "

    潘越飞

    \n" + + "
  • \n" + + "\t\n" + + "
\n" + + "
\n" + + "\n" + + " \t\t\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\t
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "

关于我们|加入我们|广告及服务|常见问题解答|提交建议\n" + + "\n" + + "

\n" + + "

Copyright © 虎嗅网\n" + + "( 京ICP备12013432 )

\n" + + "
\n" + + "
\n" + + "\n" + + " 
\n" + + "\n" + + "回顶部\n" + + "\n" + + "\t\t\t
\n" + + "\t\t\t\n" + + "\t\t\t\n" + + "\n"; + + String blogHtml = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " 一个基于Python装饰器的用户输入验证设计方案 - SamChi的个人空间 - 开源中国社区\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t\t
\n" + + " \t开源中国社区\n" + + "
\n" + + " \t\t
JetBrains 开发工具全场3折,详情»
\n" + + "
\n" + + " \t\n" + + "
\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\t\t
\n" + + "\t\t当前访客身份:\n" + + "\t\t\t\t黄亿华 [ 我的空间 | 退出 ]\n" + + "\t\t\t\t\t\t\t\n" + + "\t\t\t\t\t\t你有0新留言\t\t\t\n" + + "\t\t\t\t\t\t\t\t\n" + + "\t\t
\n" + + "\t\t
\n" + + " \t\t
\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + + " \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t
\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + "
软件
\n" + + " \n" + + "
\n" + + "\t\t\t\t\t\t\t\n" + + " \t\t
\n" + + "\t\t
\n" + + "\t\t
\n" + + "\t
\n" + + "\t
\t\n" + + "\n" + + "
\n" + + "
\n" + + "\t \"SamChi\"\n" + + " \n" + + " SamChi\n" + + "\t\t\n" + + "\t\t\t\n" + + " \t\t\t\t\t\t\t\n" + + " \n" + + "
\n" + + "
\n" + + " \t关注(21)\n" + + " \t粉丝(52)\n" + + " \t积分(37)\n" + + "
\n" + + "
\n" + + "
\n" + + "这个人很懒,啥也没写
\n" + + "\n" + + "
\n" + + "\t.发送留言\n" + + "\t.请教问题\n" + + "
\n" + + " 博客分类\n" + + " \n" + + "
\n" + + "
\n" + + " 最新评论 \n" + + "
    \n" + + "\t\t
  • \n" + + "\t\t@其斤君羊:说的很对 做什么事情都得从身边做起 更何况创业 ...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@techstan:不错\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@摩云飞:谢谢博主的总结,很有价值\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@尚楠:正在学Python,谢了\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@knightuniverse:其实我觉得,很多时候,不论是做项目还是做产品,...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@moyun:顶一个\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@SamChi:引用来自“Martinium”的评论 alert('I am admi...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@Martinium:alert('I am admin, bitch!'); 这句话亮了。...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@Ben:引用来自“ExtremeTalk”的评论 引用来自“Ben”...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@ExtremeTalk:引用来自“Ben”的评论 引用来自“ExtremeTalk”...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t
\n" + + "
\n" + + "访客统计\n" + + "
    \n" + + "\t
  • 3
  • \n" + + "
  • 33
  • \n" + + "
  • 36
  • \n" + + "
  • 842
  • \n" + + "
  • 13706
  • \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + " \t\n" + + "\t
\n" + + "\t\n" + + " \t
\t\t\n" + + "
\n" + + "

一个基于Python装饰器的用户输入验证设计方案

\n" + + "
\n" + + " \t\t \t\t \t \n" + + "\t\t\t\t\n" + + "\n" + + "8人收藏此文章,\n" + + "\n" + + "\t\n" + + "\t\t\t\n" + + " \t\t \t\t发表于7天前(2013-04-15 16:46) , \n" + + " \t\t已有127次阅读 ,共0个评论\n" + + " \t\t \t
\n" + + "
\n" + + "\t

情景

\n" + + "

最近初学Python, 语法大概熟悉了之后就开始拿web.py做点小东西,web.py非常轻量,用起来感觉很舒服。但不过无论什么语言或者框架,web开发中有一个最大烦人之处就是表单验证,web.py提供了web.form来进行表单验证的统一处理,这个东西虽然用起来很简单,但是感觉还是不太合心意,首先这套验证机制跟web.py框架耦合的程度太高,而自己的架构是这样的,业务逻辑跟web逻辑完全分离,web仅仅是交互形式的一种,即使添加客户端C/S形式的服务或者是向开发者提供API,业务逻辑也是完全可用,不需要修改,这样对用户输入的验证是属于业务逻辑这一块,不应该跟web表单耦合在一起;另外感觉web.py这套东西还是有些简单,只支持每个表单的正则验证和最后表单提交的整体验证,而很多时候可能需要对用户进行丰富的错误提示,比如针对用户名的错误会具体到是不能为空还是长度错误或者格式错误等, 这个用web.py的form验证就感觉很别扭了。于是就决定自己设计一个用户输入的验证方案。

\n" + + "

设计

\n" + + "

web项目的开发多数都是遵循这么一个结构的设计,即DAO->Service->Controller->View, 按我前面说的,对用户的输入验证应是发生在Service这一层上,这一层的设计是接受用户输入的参数,然后进行验证处理,再进行业务相关的计算,最后输入结果。每个Service接口都应该返回一个结果,我一般都会把这个结果的内容抽象成一个一致类型的对象:

\n" + + "
class Result(object):\n" +
+            "    \n" +
+            "    u''' 操作结果抽象 '''\n" +
+            "    \n" +
+            "    def __init__(self, code, value=None):\n" +
+            "        self.code = code   #操作结果代号\n" +
+            "        self.value = value #操作结果值\n" +
+            "        \n" +
+            "    def __str__(self):\n" +
+            "        return "operation result, code: %s, value: %s" % (self.code, self.value)
\n" + + "

这个结果对象包含两个属性,一个是操作结果的代码,一个是操作的值,举个例子,比如用户注册的接口,如果注册成功,那么就会返回一个这样的Result对象,code属性是'success', value属性是新注册用户分配的ID,如果用户名已经被占用,那么code属性就是'username_exised', value属性的值是None。客户端拿到code属性的值可以做响应的处理,如果是直接面向最终用户的web应用,那么就会去找到这个code对应的错误信息来展示给用户,所有的错误信息我是组织在一个单独的Python模块中(opresult.py):

\n" + + "
reg = {\n" +
+            "       'success':u'注册成功',\n" +
+            "       'username_empty':u'用户名不得为空',\n" +
+            "       'username_format':u'用户名必须只能有数字、字母下划线组成',\n" +
+            "       'username_length':u'用户名长度必须在5到10个字符之间',\n" +
+            "       'username_existed':u'用户名已经存在',\n" +
+            "       'password_empty':u'密码不得为空',\n" +
+            "       'repassword_error':u'两次密码输入不一致',       \n" +
+            "       }
reg是注册的接口名称,这样客户端通过接口名称和code就可以获取对应的提示。 \n" + + "

由此,用户输入验证就是要把接口参数同这些code联系起来。对于参数验证,Python有天生的语言优势,那就是装饰器。一开始就想到了使用装饰器来描述参数验证需求,但这个装饰器需要哪些信息?怎么个形式?这个得从表单验证的需求开始看起,个人总结表单验证大抵不过这些判断条件:

\n" + + "

1. 是否允许为空

\n" + + "

2. 长度限制:比如密码的长度一般会不允许少于多少位

\n" + + "

3. 格式限制:比如Email地址,需要正则判断

\n" + + "

4. 逻辑限制:比如注册时判断用户名是否已经存在

\n" + + "

初步根据这些判断条件设计出这么一个方案:

\n" + + "
@checkarg(username={'allow_empty':False, \n" +
+            "                    'regex':r'^[a-zA-Z\\d_]+$',\n" +
+            "                    'min-length':5, 'max-length':10, \n" +
+            "                    'check_logic':[check_username_usable]},\n" +
+            "          password={'allow_empty':False,'regex':r'.{6,}'},\n" +
+            "          repassword={'allow-empty':False, 'check_logic':\n" +
+            "                      [(lambda **kw:(kw['password'] == kw['repassword'], "repassword_error"))]})\n" +
+            "def reg(username, password, repassword):\n" +
+            "    ....
\n" + + "

每一个参数使用一个字典来描述验证信息, allow_empty是表示是否为空,regex为验证的正则表达式,min-length和max-length用来描述长度,check_logic用来配置其他的验证逻辑。然后如何把这些验证结果同code进行匹配呢?最开始是在这个验证信息的字典中有一项'code':{'allow_empty':'username_empty'}通过这样的形式去匹配错误提示,但是感觉这样整的这个参数太复杂了(感觉现在已经挺复杂了- -b),于是决定这个地方使用约定优于配置的形式,code的值为'参数名_错误类型'的形式,比如allow_empty如果验证了为空,那么会自动返回名为username_empty的code,如果是一些额外的处理逻辑呢?没法做约定,怎么办?那么就约定这些检测函数返回一个元组,第一个元素为一个bool值,表示成功失败,第二个参数为code,表示失败原因,比如判断两次密码是否输入一致的那个lambda:

\n" + + "
lambda **kw:(kw['password'] == kw['repassword'], "repassword_error"
\n" + + "

嗯,大体就是这样的一个设计。

\n" + + "

实现

\n" + + "

根据上面的设计,把最终的装饰器实现了出来, 逻辑比较简单,关于装饰器设计的一些细节可以参阅Python参考手册:

\n" + + "
regex_cache = {}\n" +
+            "     \n" +
+            "def checkarg(**args):\n" +
+            "    \n" +
+            "    u'''参数检测装饰器'''\n" +
+            "    \n" +
+            "    def _checkarg(function):\n" +
+            "        \n" +
+            "        def __checkarg(**func_kw):\n" +
+            "            for key in func_kw:\n" +
+            "                if key in args:\n" +
+            "                    \n" +
+            "                    #要验证的值\n" +
+            "                    value = func_kw[key]\n" +
+            "                    \n" +
+            "                    #验证规则\n" +
+            "                    valid_rules = args[key]\n" +
+            "                    \n" +
+            "                    #检测空\n" +
+            "                    allow_empty = valid_rules.get('allow_empty')\n" +
+            "                    if not allow_empty:\n" +
+            "                        if not value or not value.strip():\n" +
+            "                            return Result(key + "_empty")\n" +
+            "                    elif not value:\n" +
+            "                        #如果是空的并且忽略空检测,那么下面的就不需要检查了\n" +
+            "                        continue;\n" +
+            "                    \n" +
+            "                    #检测长度\n" +
+            "                    if 'min-length' in valid_rules:\n" +
+            "                        min_length = valid_rules['min-length']\n" +
+            "                        if min_length > len(value):\n" +
+            "                            return Result(key + "_length")\n" +
+            "                        \n" +
+            "                    if 'max-length' in valid_rules:\n" +
+            "                        max_length = valid_rules['max-length']\n" +
+            "                        if max_length < len(value):\n" +
+            "                            return Result(key + "_length")\n" +
+            "                    \n" +
+            "                    #检测正则\n" +
+            "                    if 'regex' in valid_rules:\n" +
+            "                        #获取编译后的正则\n" +
+            "                        regex = valid_rules['regex']\n" +
+            "                        regexcmp = regex_cache.get(regex)\n" +
+            "                        if not regexcmp:\n" +
+            "                            regexcmp = re.compile(regex)\n" +
+            "                            regex_cache[regex] = regexcmp\n" +
+            "                        if not regexcmp.search(value):\n" +
+            "                            return Result(key + "_format")\n" +
+            "                    \n" +
+            "                    #检测其他逻辑\n" +
+            "                    check_logics = valid_rules.get('check_logic')\n" +
+            "                    if check_logics:\n" +
+            "                        for logic in check_logics:\n" +
+            "                            result, code = logic(**func_kw)\n" +
+            "                            if not result:\n" +
+            "                                return Result(code)\n" +
+            "                                \n" +
+            "            function(**func_kw)\n" +
+            "        return __checkarg\n" +
+            "                            \n" +
+            "    return _checkarg
\n" + + "\t \t \n" + + "
\n" + + "\t\t\n" + + "
\n" + + "\t \t\n" + + "\t \t \n" + + "
\t\t\n" + + "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + + "\t \t
\n" + + "\n" + + " \n" + + "\t
\n" + + "\n" + + "\t\n" + + "\t
\n" + + "\t\n" + + "\t\n" + + "\t\t分享到: \n" + + "\t\t\n" + + "\t\t\n" + + "\t\n" + + " 已有 0人顶\n" + + "\t\n" + + "\t
\n" + + "\t\t\n" + + "
\n" + + "
\n" + + "
\n" + + "

共有 0 条网友评论

\n" + + "\t\t\t

尚无网友评论

\n" + + "\t\t
    \n" + + "\t\t
\n" + + "
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\t \n" + + "\t \n" + + "\t 文明上网,理性发言\n" + + "
\n" + + "\t回到页首 | 回到评论列表\n" + + "
\n" + + "
\n" + + "\t\n" + + "
\n" + + "\t关闭相关文章阅读\n" + + "\t\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + ""; + + String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " 再次吐槽easyui - 开源中国 OSChina.NET\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "
\n" + + "\t
\n" + + "
\n" + + " \t\n" + + "
\n" + + "\t\t
\n" + + " \t\t \t\t黄亿华,您好 \n" + + "\t\t\t\n" + + "\t\t\t\t我的空间\n" + + "\t\t\t\t\n" + + "\t\t\t | \n" + + "\t\t\t添加软件 | 投递新闻 | 退出\n" + + " \t\t\t\t
\n" + + "\t\t
\n" + + "\t
\n" + + "
\n" + + "
\n" + + "

讨论区

\n" + + "
\n" + + "\t
当前位置:
\n" + + "\t
\n" + + "\t\t\t\t\t \t\t讨论区 »\n" + + " \t\t技术问答\t\t\t\t\t\t\t\t» EasyUI\n" + + "\t\t\t\t\t\t\t\t\t\t
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t\n" + + "\t
\n" + + "\t
\n" + + "\t\t
\"午后冬日\"
\n" + + "\t\t
\n" + + "\t\t\t

再次吐槽easyui

\n" + + "\t\t\t
\n" + + "\t\t\t\t午后冬日\n" + + "\t\t\t\t发表于 2013-4-21 02:28 13小时前,\n" + + "\t\t\t\t3回/289阅,\n" + + "\t\t\t\t最后回答: 4小时前\t\t\t\t\t\t\t\t\t\t\t
\n" + + "\t\t
\n" + + "\t\t\n" + + "\t\t
\n" + + "\t
\n" + + "\t\t \t \t\t\t\t\t\n" + + "\t\t

Java、PHP、Ruby、iOS、Python 等 JetBrains 开发工具低至 99 元(3折),详情»

\n" + + "\t\t
\n" + + "\t\t\t\t\t\t
刚用到easyui treegrid组件,发现这货第一次加载时候并没有传默认参数,展开某一列时候才传递id:xx的参数。这样和后台总是疙里疙瘩,像没事就拌嘴的两口子,查网上都遇到相同问题,最好解决方案就是通过 \n" + + "onBeforeExpand事件来扩展,自行解决。看到官方例子中简洁的代码,感觉easyui耍流氓了,真搞不懂为何要这样实现
\n" + + "\t\t\t\t\t\t
\n" + + "\t\t\t\t标签:\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\tEasyUI \t\t\t\t\t\t\t\t\t\t\t
\n" + + "\t\t\t\t\t\t
\n" + + "\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t我想问同样的问题\n" + + "\t\t\t\t\t\t\n" + + "\t\t\t共0个人想要问同样的问题\n" + + "\t\t\t\t\t\t补充话题说明»\n" + + "\t\t\t
\n" + + "\t\t\t\t\t\t
\n" + + "\t
    \n" + + "
    \t\t
    \n" + + "\t\t\n" + + "\t\t
    \n" + + " \n" + + "\t\t\t\t
    \n" + + "\t\t\t
    分享到
    \n" + + "\t\t\t\n" + + "\t\t\t
    1
    \n" + + "\t\t\t\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t\t\t \t\t\t\n" + + "\t\t\t\t\t\t\t\t0\n" + + "\t\t\t\t|\n" + + "\t\t\t\t\t\t\t\t \t\t\t\n" + + "\t\t\t\t\t\t\t\t0\n" + + "\t\t\t
    \n" + + "\t\t\t\n" + + "\t\t
    \n" + + "\t\t
    \n" + + "\t\t\t\t\t\t
    \n" + + "\t\t\t\n" + + " \t

    \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t按评价排序 |\n" + + "\t\t\t\t\t显示最新答案 | 回页面顶部\n" + + "\t\t\t\t\n" + + "\t\t\t\t共有3个答案 我要回答»\n" + + "\t\t\t

    \n" + + "\t\t\t \t
    • \n" + + "\t
      \"布谷鸟\"
      \n" + + "\t
      \n" + + "\t\t
      布谷鸟 回答于 2013-04-21 09:28
      \t\t\n" + + " \t
      \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
      \n" + + "\t\t
      \n" + + "\t\t
      对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
      \n" + + "\t
      \n" + + "\t
      \n" + + "\t
      --- 共有 1 条评论 --- \n" + + "
        \n" + + "\t\t
      • \n" + + "\t\t\"午后冬日\"\n" + + "\t\t\n" + + "\t\t前端水平实在有限,自己搞的总是感觉不伦不类,只能用这些框架,再集成其它插件,切换主题时风格又不一致。\n" + + "\t\t(4小时前 by 午后冬日)\n" + + "\t\t回复\n" + + "\t\t\n" + + "\t\t
        \n" + + "\t
      • \n" + + "\t
      \n" + + "\n" + + "
      \n" + + "\t
      \t\t\t\t\t\t有帮助(1) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(1) |\n" + + " \t引用此答案\t
      \n" + + "
    • \n" + + "\t
      \"静风流云\"
      \n" + + "\t
      \n" + + "\t\t
      静风流云 回答于 2013-04-21 11:08
      \t\t\n" + + " \t
      \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
      \n" + + "\t\t
      \n" + + "\t\t

      没办法,原来项目也是因为客户特殊的需求,对layout选型的时候,犹豫了好久,最终放弃了。
      幸亏来了一个厉害的前端,解决问题,够用就好。

      \n" + + "\t
      \n" + + "\t
      \n" + + "\t
      --- 共有 1 条评论 --- \n" + + "
        \n" + + "\t\t
      • \n" + + "\t\t\"午后冬日\"\n" + + "\t\t\n" + + "\t\t我也是犹豫了好久,看过很多前端框架,总是不太满意。个人开发前台后台数据库全部要自己搞定,郁闷ing\n" + + "\t\t(4小时前 by 午后冬日)\n" + + "\t\t回复\n" + + "\t\t\n" + + "\t\t
        \n" + + "\t
      • \n" + + "\t
      \n" + + "\n" + + "
      \n" + + "\t
      \t\t\t\t\t\t有帮助(0) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(1) |\n" + + " \t引用此答案\t
      \n" + + "
    • \n" + + "\t
      \"布谷鸟\"
      \n" + + "\t
      \n" + + "\t\t
      布谷鸟 回答于 2013-04-21 11:29
      \t\t\n" + + " \t
      \n" + + "\t\t\t \t\t \t\t举报\n" + + " \t
      \n" + + "\t\t
      \n" + + "\t\t

      引用来自“布谷鸟”的答案

      对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
      前后端你一个人搞啊?那确实很麻烦。面面俱到的话,工作量很大。但是如果需要实现的功能不是很多,而时间也不紧迫的话,事情干起来也还不错。如非必须,建议逐步弃用这些前端框架,在一些比较能够提升体验的地方选用一些适当的插件即可,如此也不再需要担心风格的问题,你看osc后台截图,界面那叫一个丑,用得方便顺手就够了
      \n" + + "\t
      \n" + + "\t
      \n" + + "\t
      \n" + + "\t
      \t\t\t\t\t\t有帮助(0) |\n" + + "\t\t没帮助(0) |\n" + + "\t\t评论(0) |\n" + + " \t引用此答案\t
      \n" + + "
    \n" + + "\t\t\t\t
    \n" + + "\t\t
    \n" + + "\t\t\t
    \"黄亿华\"
    \n" + + "\t\t\t
    \n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t

    \n" + + "\t\t\t\t回答案顶部 | 回页面顶部\n" + + "\t\t\t
    \n" + + "\t\t\t
    \n" + + "\t\t\t\n" + + "\t\t
    \n" + + "\t
    \t\n" + + "\t\n" + + "\n" + + "\n" + + "\n" + + "\t
    \n" + + "\t
    \n" + + " \t\n" + + "\t
    \n" + + "\t\t
    \n" + + "\t\t\t有什么技术问题吗?\n" + + "\t\t\t我要提问\n" + + "\t\t\t
    \n" + + "\t\t
    \n" + + "\t\t\n" + + "\t\t\t\t\t\t
    \n" + + "\t\t\t全部(29)...午后冬日的其他问题\n" + + "\t\t\t\n" + + "\t\t
    \n" + + "\t\t\t\t
    \n" + + "\t\t\n" + + "\t\t
    \n" + + "\t\t\n" + + "\t\t
    \n" + + "\t\t\t类似的话题\n" + + "\t\t\t\n" + + "\t\t
    \n" + + "\t
    \n" + + "\t
    \n" + + "
    \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "\t
    \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
    © 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "\n" + + "\n" + + ""; + + @Test + public void test(){ + String text = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " jsoup 解析页面商品信息 - - ITeye技术网站\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + " 首页\n" + + " 资讯\n" + + " 精华\n" + + " 论坛\n" + + " 问答\n" + + " 博客\n" + + " 专栏\n" + + " 群组\n" + + " 更多 \n" + + "
    \n" + + " 招聘\n" + + " 搜索\n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + " 欢迎flashsword20\n" + + " 0\n" + + " \n" + + " \"Newpm\"收件箱(3)\n" + + " \n" + + " 我的应用\n" + + "
    \n" + + " 我的关注\n" + + " 我的群组\n" + + " 我的简历\n" + + " 我的相册\n" + + " 我的收藏\n" + + " 我的代码\n" + + " 我的微博\n" + + "
    \n" + + " 我的博客\n" + + " 设置\n" + + "
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + " \n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "
    \n" + + "
    \n" + + "

    \n" + + " jsoup 解析页面商品信息\n" + + " \n" + + "

    \n" + + " \n" + + "
     
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "

    今天用了jsoup 解析页面商品信息,感觉比用xpath获取信息准确多了

    \n" + + "

    \n" + + "

    下面就记录一下:

    \n" + + "

    一、首先去 http://jsoup.org/download 下载jsoup的jar包。

    \n" + + "

    \n" + + "

    二、下面记录下相关代码:

    \n" + + "

    \n" + + "

    \n" + + "

    Document doc = Jsoup.connect(url).get(); //将htm转换成Document类型数据结构

    \n" + + "


    doc.select(\"div:has(div) div#spec-n1:has(img) img\").first().attr(\"src\")); //查找div下含有div的标签

    \n" + + "

    \n" + + "

    并且 div的id='spec-n1',此div第一个img标签,img里属性是src的值。

    \n" + + "

    \n" + + "

    doc.select(\"div:has(div) div.crumb:has(a) a:eq(4)\").text(); //查找class='crumb'的div下第4个a标签

    \n" + + "

    下的值。

    \n" + + "

    \n" + + "

    doc.select(\"div:has(div) div#name:has(h1)\").text(); //查找id='name'的div下的h1标签的值。

    \n" + + "

    \n" + + "

    doc.select(\"tbody:has(tr) td.tdTitle:contains(品牌) + td\").text(); //查找class='tdTitle'的td标签里

    \n" + + "

    \n" + + "

    含有‘品牌’td的下一个td标签中内容。

    \n" + + "

    \n" + + "

    doc.select(\"script[type=text/javascript]:not([src~=[a-zA-Z0-9./\\\\s]+)\"); //查找含有此<script

    \n" + + "

    \n" + + "

    type=\"text/javascript\">……</script>内容,不含有script标签中有src属性的script,如:

    \n" + + "

    \n" + + "

    <script src=\"url\" type=\"text/javascript\"></script>。

    \n" + + "
    \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + "
    \n" + + " \n" + + "
    分享到:\n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + " \n" + + "
    \n" + + " \n" + + "
    \n" + + "\n" + + "
    \n" + + "
    评论
    \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
    \n" + + "\n" + + "
    \n" + + "
    发表评论
    \n" + + "
    \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "

    (快捷键 Alt+S / Ctrl+Enter)

    \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"masong1987的博客\"
    \n" + + "
    masong1987
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "
      \n" + + "
    • 浏览: 5401 次
    • \n" + + "
    • 性别: \"Icon_minigender_1\"
    • \n" + + "
    • 来自: 北京
    • \n" + + "
    • \n" + + " \n" + + "
    • \n" + + " 发短消息\n" + + " \n" + + " 更多访客>>\n" + + " \n" + + "
      \n" + + "
      \"flashsword20的博客\"
      \n" + + " \n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \"dylinshi126的博客\"
      \n" + + " \n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \"machoo的博客\"
      \n" + + " \n" + + "
      \n" + + " \n" + + "
      \n" + + "
      \"arson的博客\"
      \n" + + " \n" + + "
      \n" + + " \n" + + "
    \n" + + "\n" + + " \n" + + "\n" + + "
    \n" + + "
    文章分类
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    社区版块
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    存档分类
    \n" + + " \n" + + "
    \n" + + " \n" + + " \n" + + "\n" + + "
    \n" + + "
    最新评论
    \n" + + " \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + " 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。若作者同意转载,必须以超链接形式标明文章原始出处和作者。
    \n" + + " © 2003-2012 ITeye.com. All rights reserved. [ 京ICP证110151号 京公网安备110105010620 ]\n" + + "
    \n" + + "
    \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n"; + String text2="
    aaa
    "; + XpathSelector xpathSelector = new XpathSelector("//div[@id='main']/div[@class='blog_main']/div[1][@class='blog_title']/h3/a"); + String select = xpathSelector.select(text); + Assert.assertEquals("jsoup 解析页面商品信息",select); + } + + @Test + public void testOschina(){ + Html html1 = new Html(html); + Assert.assertEquals("再次吐槽easyui",html1.x(".//*[@class='QTitle']/h1/a").toString()); + } + + @Test + public void testOschinaBlog(){ + Html html1 = new Html(blogHtml); + System.out.println(html1.sc()); + } + + @Test + public void testHuxiuBlog(){ + Html html1 = new Html(huxiuHtml); + System.out.println(html1.sc()); + } +} diff --git a/src/test/java/us/codecraft/spider/utils/UrlUtilsTest.java b/src/test/java/us/codecraft/spider/utils/UrlUtilsTest.java new file mode 100644 index 00000000..305bad7a --- /dev/null +++ b/src/test/java/us/codecraft/spider/utils/UrlUtilsTest.java @@ -0,0 +1,647 @@ +package us.codecraft.spider.utils; + +import org.junit.Assert; +import org.junit.Test; + +/** + * User: cairne + * Date: 13-4-21 + * Time: 下午2:22 + */ +public class UrlUtilsTest { + + @Test + public void testFixRelativeUrl() { + String fixrelativeurl = UrlUtils.fixRelativeUrl("aa", "http://www.dianping.com/sh/ss/com"); + System.out.println("fix: " + fixrelativeurl); + Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl); + + fixrelativeurl = UrlUtils.fixRelativeUrl("../aa", "http://www.dianping.com/sh/ss/com"); + Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); + + fixrelativeurl = UrlUtils.fixRelativeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); + Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); + fixrelativeurl = UrlUtils.fixRelativeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); + Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); + fixrelativeurl = UrlUtils.fixRelativeUrl("..aa", "http://www.dianping.com/sh/ss/com"); + Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); 
+// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com"); +// System.out.println("fix: " + fixrelativeurl); +// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com/"); +// System.out.println("fix: " + fixrelativeurl); + } + + @Test + public void testFixRelativeHtml(){ + String html = "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "虎嗅网\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "\t\t\t\n" + + "\t\n" + + "
    \n" + + "

    \"虎嗅网\"

    \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "订阅虎嗅\n" + + "RSS\n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "\n" + + "\n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "\n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "

    震后48小时,互联网公司行动启示录

    \n" + + "

    在公益产品开发上,互联网合作开放共享一面应得体现,商业竞争一面则应被冲淡

    \n" + + " \"震后48小时,互联网公司行动启示录\"\n" + + "
    \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"李经纬逝世,围绕他展开的一个时代和三个男人\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"今日嗅评:一切都应在灾难面前握手言和\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"娜拉出走以后怎么办?读《从理想主义到经验主义》,向自由致敬\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"【每日移动观察】Kindle手机可期?\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"地震后,为什么手机不通微信通?\"/
    \n" + + "
    \n" + + "
      \n" + + "
    • 地震后,为什么手机不通微信通?

    • \n" + + "
    • 北京晨报 发表于 2013-04-22 07:23
    • \n" + + "
    • 微信的工作原理是分组交换的业务模式。它经过压缩处理,占用的通道可宽可窄,信息可以一站站推送,有传输空间时再送出。在同等网络条件下,微信占用的网络资源要小得多
    • \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"施密特:国家是具有垄断地位的服务提供商\"/
    \n" + + "
    \n" + + "
      \n" + + "
    • 施密特:国家是具有垄断地位的服务提供商

    • \n" + + "
    • Guardian 发表于 2013-04-22 15:51
    • \n" + + "
    • 国家提供了统一的规则。国家会制定实体政策和虚拟政策,这种二元性——网络空间实施一种战略,实体空间又部署另外一种战略——是可能的。
    • \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"库克位子成疑。这就是华尔街\"/
    \n" + + "
    \n" + + "
      \n" + + "
    • 库克位子成疑。这就是华尔街

    • \n" + + "
    • Hotashang 发表于 2013-04-22 14:23
    • \n" + + "
    • 华尔街就是这样的一个地方,在长期投资预期与短视利润之间纠结。而作为公司的CEO,也会经常在这种长期投资预期与短视利润之间摇摆
    • \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"跨国公司全球声誉排名:苹果跌破前十,宝马高居榜首\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"我为什么说生鲜电商是个伪命题?\"/
    \n" + + "
    \n" + + "
      \n" + + "
    • 我为什么说生鲜电商是个伪命题?

    • \n" + + "
    • 独自等待 发表于 2013-04-22 11:58
    • \n" + + "
    • 我也一直很期待这个领域能有一个大师兄,既牵着马,又挑着担,还能吓退妖怪,完成阿什顿伊顿般十项全能然后华丽转身,但,仅仅也就是期待罢了
    • \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"怎么估算雅安芦山县地震的经济影响?422.6亿元\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"8个月,2500亿美元,蒂姆・库克一露面股票就跌\"/
    \n" + + "
    \n" + + "
      \n" + + "
    • 8个月,2500亿美元,蒂姆・库克一露面股票就跌

    • \n" + + "
    • 虎嗅 发表于 2013-04-22 11:29
    • \n" + + "
    • 苹果是散户投资者最喜欢的个股,分析师指出苹果的增长陷入停顿才是大问题。市场在用自己的方式和蒂姆・库克进行沟通,每次他公开发表观点,股价就下跌,无一幸免。
    • \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"为什么19楼你学不会?\"/
    \n" + + "
    \n" + + "
      \n" + + "
    • 为什么19楼你学不会?

    • \n" + + "
    • 周宁 发表于 2013-04-22 10:06
    • \n" + + "
    • ①一句报网互动不能解决问题;②产品必须要有自身特色;③不要随意去学别人的定位;④19楼的盈利模式不好学;⑤19楼的团队你学不会!
    • \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"iCar?很难指望了!可是苹果对汽车仍有野望\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"网络时代,重大突发新闻报道为何一错再错?\"/
    \n" + + "
    \n" + + "
      \n" + + "
    • 网络时代,重大突发新闻报道为何一错再错?

    • \n" + + "
    • QUARTZ 发表于 2013-04-22 09:39
    • \n" + + "
    • 抢先发布,而不关心是否属实,媒体啊、媒体!在一场公众眼球的“盛宴”中,对那些真正重要的问题选择集体漠视。争夺注意力有那么重要吗?有吗?
    • \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"亚马逊首位数据挖掘负责人往事:开发出亚马逊最赚钱项目\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"马化腾与马云为何将数百万捐款,都放入壹基金参与雅安救助?\"/
    \n" + + "
    \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \"这16人从科技出发,在时代中留下烙印\"/
    \n" + + "
    \n" + + "
      \n" + + "
    • 这16人从科技出发,在时代中留下烙印

    • \n" + + "
    • TIME.com 发表于 2013-04-20 07:27
    • \n" + + "
    • 这些人从科技出发,影响了“时代”,就像逝去的乔布斯曾说的,“在宇宙中留下一道烙印”。无一不具备冒险与探索精神。
    • \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    更多
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
    \n" + + "
      \n" + + "
    • \n" + + "
      \n" + + "
      \n" + + "

      产品情感化设计的两个层面

      \n" + + "

      云瑞

      \n" + + " 2013-04-22\n" + + "
      \n" + + "
    • \n" + + "
    • 现在的网站文案已经越来越有人情味了。例如提示文案不是“你的账号密码错误”而是“密码不对哦”,文案中增加了语气词。文案内容的情感化也会增加用户的接受程度
    • \n" + + "
    • 评论(0) 产品 投稿
    • \n" + + "
    \n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "
    \n" + + "
      \n" + + "
    • \n" + + "
      \n" + + "
      \n" + + "

      这一次救灾,互联网好样的!

      \n" + + "

      葛甲

      \n" + + " 2013-04-22\n" + + "
      \n" + + "
    • \n" + + "
    • 本次四川雅安地震之后,网上谣言少了,辟谣的多了;传谣的少了,不信谣的多了;阴谋论少了,正能量多了;博眼球的企业少了,做实事的企业多了
    • \n" + + "
    • 评论(4) 公益
    • \n" + + "
    \n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "
    \n" + + "
      \n" + + "
    • \n" + + "
      \n" + + "
      \n" + + "

      关于地震和救灾的常见误区

      \n" + + "

      左志坚

      \n" + + " 2013-04-21\n" + + "
      \n" + + "
    • \n" + + "
    • 这一次,又是举国沸腾,但一些认知误区仍然存在。我想就这五年来的观察,做一些简单的总结,希望对关心灾区的朋友有些帮助和启发
    • \n" + + "
    • 评论(8) 公益
    • \n" + + "
    \n" + + "
    \n" + + " \n" + + "
    \n" + + "
    \t
    \n" + + "\t\t\n" + + "\t
    \n" + + "\t
    \n" + + "\t\t
    \n" + + "\t\t\t
    \"官方微信\"
    \n" + + "\t\t\t
    \n" + + "\t\t\t\t
      \n" + + "\t\t\t\t\t
    • 官方微信
    • \n" + + "\t\t\t\t\t
    • 微信扫描二维码,
      获得每日精选资讯
    • \n" + + "\t\t\t\t
    \n" + + "\t\t\t
    \n" + + "\t\t
    \n" + + "\t\t
    \n" + + "\t\t\t
      \n" + + "\t\t\t\t
    • 官方微博
    • \n" + + "\t\t\t\t
    • \n" + + "\t\t\t\t
    • \n" + + "\t\t\t
    \n" + + "\t\t
    \n" + + "\t
    \n" + + "
    \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \t
    \n" + + "\n" + + "\n" + + "
    \n" + + "
    \n" + + "

    关于我们|加入我们|广告及服务|常见问题解答|提交建议\n" + + "\n" + + "

    \n" + + "

    Copyright © 虎嗅网\n" + + "( 京ICP备12013432 )

    \n" + + "
    \n" + + "
    \n" + + "\n" + + " 
    \n" + + "\n" + + "回顶部\n" + + "\n" + + "\t\t\t
    \n" + + "\t\t\t\n" + + "\t\t\t\n" + + "\n"; + String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/"); + String text = "订阅虎嗅"; + Assert.assertTrue(html.contains(" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +