From ad66d33f3877a10aec3e3c20792e5abdcf9f2e6a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 20 Aug 2013 23:39:59 +0800 Subject: [PATCH 01/39] [maven-release-plugin] prepare for next development iteration --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 4bcacc63..aeb54f61 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.2.1 + 0.2.2-SNAPSHOT 4.0.0 pom webmagic-parent @@ -32,7 +32,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-parent-0.2.1 + HEAD diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 06791067..b19820df 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.2.1 + 0.2.2-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 29ad49d7..1914b71b 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.2.1 + 0.2.2-SNAPSHOT 4.0.0 From ce988d2f19b1a9c29141fdfd7e309c6c961cf4fc Mon Sep 17 00:00:00 2001 From: linkerlin Date: Wed, 21 Aug 2013 06:55:18 +0800 Subject: [PATCH 02/39] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=86=E5=AF=B9?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E7=BC=96=E7=A0=81=E7=9A=84=E5=BC=BA=E5=88=B6?= =?UTF-8?q?=E9=99=90=E5=AE=9A=EF=BC=8C=E9=99=90=E5=AE=9A=E4=BA=8EUTF-8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pom.xml b/pom.xml index aeb54f61..e3bd30e0 100644 --- a/pom.xml +++ b/pom.xml @@ -9,6 +9,10 @@ 0.2.2-SNAPSHOT 4.0.0 pom + + UTF-8 + UTF-8 + webmagic-parent webmagic-parent From 9bba0b2b725f22e867265ce7f39e4c5e7ac9de27 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 21 Aug 2013 07:36:41 +0800 Subject: [PATCH 03/39] update 
readme version --- README.md | 14 ++++---------- release.properties | 11 ----------- webmagic manual.md | 11 +++-------- zh_docs/README.md | 12 +++--------- 4 files changed, 10 insertions(+), 38 deletions(-) delete mode 100644 release.properties diff --git a/README.md b/README.md index 47c93f60..fa726dae 100644 --- a/README.md +++ b/README.md @@ -16,24 +16,18 @@ webmagic ## Install: - -Clone the repo and build: - - git clone https://github.com/code4craft/webmagic.git - cd webmagic - mvn clean install - -Add dependencies to your project: + +Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.2.0 + 0.2.1 us.codecraft webmagic-extension - 0.2.0 + 0.2.1 ## Get Started: diff --git a/release.properties b/release.properties deleted file mode 100644 index 86e7224e..00000000 --- a/release.properties +++ /dev/null @@ -1,11 +0,0 @@ -#release configuration -#Tue Aug 20 23:36:56 CST 2013 -scm.tagNameFormat=@{project.artifactId}-@{project.version} -pushChanges=true -scm.url=scm\:git\:git@github.com\:code4craft/webmagic.git -preparationGoals=clean verify -remoteTagging=true -scm.commentPrefix=[maven-release-plugin] -exec.additionalArguments=-Psonatype-oss-release -P development -exec.snapshotReleasePluginAllowed=false -completedPhase=check-poms diff --git a/webmagic manual.md b/webmagic manual.md index 0f4d4e4b..b9d75418 100644 --- a/webmagic manual.md +++ b/webmagic manual.md @@ -21,22 +21,17 @@ webmagic使用手册 ### 使用maven -webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译: - - git clone https://github.com/code4craft/webmagic.git - mvn clean install - -安装后,在项目中添加对应的依赖即可使用webmagic: +webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic: us.codecraft webmagic-core - 0.2.0 + 0.2.1 us.codecraft webmagic-extension - 0.2.0 + 0.2.1 #### 项目结构 diff --git a/zh_docs/README.md b/zh_docs/README.md index 31eb2ba9..a990244e 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -29,23 +29,17 @@ Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitca ### 使用maven 
-webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译: - - git clone https://github.com/code4craft/webmagic.git - cd webmagic - mvn clean install - -安装后,在项目中添加对应的依赖即可使用webmagic: +webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic: us.codecraft webmagic-core - 0.2.0 + 0.2.1 us.codecraft webmagic-extension - 0.2.0 + 0.2.1 #### 项目结构 From 4fa82aad20b0d208c8c2b17af2644f82b26c1b75 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 21 Aug 2013 07:44:39 +0800 Subject: [PATCH 04/39] readme --- webmagic manual.md | 2 +- zh_docs/README.md | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/webmagic manual.md b/webmagic manual.md index b9d75418..dc09b907 100644 --- a/webmagic manual.md +++ b/webmagic manual.md @@ -46,7 +46,7 @@ webmagic主要包括两个包: webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 -webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来: +webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译: * **webmagic-saxon** diff --git a/zh_docs/README.md b/zh_docs/README.md index a990244e..ee8580c1 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -39,7 +39,8 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-extension - 0.2.1 + 0.2.1 + #### 项目结构 @@ -54,7 +55,7 @@ webmagic主要包括两个包: webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 -webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来: +webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译:: * **webmagic-saxon** From 91dcccf7b574549c7aed4740c74ed30bce6aa795 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 21 Aug 2013 21:55:15 +0800 Subject: [PATCH 05/39] add a sample --- .../main/java/us/codecraft/webmagic/Page.java | 17 ++++ .../scheduler/ZipCodePageProcessor.java | 83 +++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java 
index afdf2320..93c184d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -87,6 +87,23 @@ public class Page { } } + /** + * add urls to fetch + * + * @param requests + */ + public void addTargetRequests(List requests,long priority) { + synchronized (targetRequests) { + for (String s : requests) { + if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { + break; + } + s = UrlUtils.canonicalizeUrl(s, url.toString()); + targetRequests.add(new Request(s).setPriority(priority)); + } + } + } + /** * add url to fetch * diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java new file mode 100644 index 00000000..e6b3f66c --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java @@ -0,0 +1,83 @@ +package us.codecraft.webmagic.samples.scheduler; + +import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.PriorityScheduler; + +import java.util.List; + +import static us.codecraft.webmagic.selector.Selectors.regex; +import static us.codecraft.webmagic.selector.Selectors.xpath; + +/** + * @author code4crafter@gmail.com + */ +public class ZipCodePageProcessor implements PageProcessor { + + private Site site = Site.me().setCharset("gb2312").setSleepTime(0).addStartUrl("http://www.ip138.com/post/"); + + @Override + public void process(Page page) { + if (page.getUrl().toString().equals("http://www.ip138.com/post/")) { + processCountry(page); + } else if (page.getUrl().regex("http://www\\.ip138\\.com/post/\\w+[/]?$").toString() != 
null) { + processProvince(page); + } else { + processDistrict(page); + } + + } + + private void processCountry(Page page) { + List provinces = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all(); + for (String province : provinces) { + String link = xpath("//@href").select(province); + String title = xpath("/text()").select(province); + Request request = new Request(link).setPriority(0).putExtra("province", title); + page.addTargetRequest(request); + } + } + + private void processProvince(Page page) { + //这里仅靠xpath没法精准定位,所以使用正则作为筛选,不符合正则的会被过滤掉 + List districts = page.getHtml().xpath("//body/table/tbody/tr/td").regex(".*http://www\\.ip138\\.com/post/\\w+/\\w+.*").all(); + for (String district : districts) { + String link = xpath("//@href").select(district); + String title = xpath("/text()").select(district); + Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title); + page.addTargetRequest(request); + } + } + + private void processDistrict(Page page) { + String province = page.getRequest().getExtra("province").toString(); + String district = page.getRequest().getExtra("district").toString(); + List counties = page.getHtml().xpath("//body/table/tbody/tr").regex(".*\\d+.*").all(); + String regex = "]*>([^<>]+)]*>([^<>]+)]*>([^<>]+)]*>([^<>]+)"; + for (String county : counties) { + String county0 = regex(regex, 1).select(county); + String county1 = regex(regex, 2).select(county); + String zipCode = regex(regex, 3).select(county); + page.putField("result", StringUtils.join(new String[]{province, district, + county0, county1, zipCode}, "\t")); + } + List links = page.getHtml().links().regex("http://www\\.ip138\\.com/post/\\w+/\\w+").all(); + for (String link : links) { + page.addTargetRequest(new Request(link).setPriority(2).putExtra("province", province).putExtra("district", district)); + } + + } + + @Override + public Site getSite() { + return site; + } + + public 
static void main(String[] args) { + Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).run(); + } +} From 0b9e0465ed0fc18541408aa9d496edc0eb0ecf7d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 21 Aug 2013 23:49:15 +0800 Subject: [PATCH 06/39] add delay queue --- .../scheduler/DelayQueueScheduler.java | 82 +++++++++++++++++++ .../scheduler/LevelLimitScheduler.java | 24 ++++++ .../scheduler/ZipCodePageProcessor.java | 8 +- .../scheduler/DelayQueueSchedulerTest.java | 24 ++++++ 4 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java create mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java new file mode 100644 index 00000000..a52b3d4b --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java @@ -0,0 +1,82 @@ +package us.codecraft.webmagic.samples.scheduler; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.PriorityScheduler; + +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.DelayQueue; +import java.util.concurrent.Delayed; +import java.util.concurrent.TimeUnit; + +/** + * @author code4crafter@gmail.com + */ +public class DelayQueueScheduler extends PriorityScheduler { + + private DelayQueue queue = new DelayQueue(); + + private Set urls = new HashSet(); + + private long time; + + private TimeUnit timeUnit; + + private class RequestWrapper implements Delayed { + + private long startTime = 
System.currentTimeMillis(); + + private Request request; + + private RequestWrapper(Request request) { + this.request = request; + } + + private long getStartTime() { + return startTime; + } + + private Request getRequest() { + return request; + } + + @Override + public long getDelay(TimeUnit unit) { + long convert = unit.convert(TimeUnit.MILLISECONDS.convert(time, timeUnit) - System.currentTimeMillis() + startTime, TimeUnit.MILLISECONDS); + return convert; + } + + @Override + public int compareTo(Delayed o) { + return new Long(getDelay(TimeUnit.MILLISECONDS)).compareTo(o.getDelay(TimeUnit.MILLISECONDS)); + } + } + + public DelayQueueScheduler(long time, TimeUnit timeUnit) { + this.time = time; + this.timeUnit = timeUnit; + } + + @Override + public synchronized void push(Request request, Task task) { + if (urls.add(request.getUrl())) { + queue.add(new RequestWrapper(request)); + } + + } + + @Override + public synchronized Request poll(Task task) { + RequestWrapper take = null; + while (take == null) { + try { + take = queue.take(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + queue.add(new RequestWrapper(take.getRequest())); + return take.getRequest(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java new file mode 100644 index 00000000..79ef209f --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.samples.scheduler; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.PriorityScheduler; + +/** + * @author code4crafter@gmail.com + */ +public class LevelLimitScheduler extends PriorityScheduler { + + private int levelLimit = 3; + + public LevelLimitScheduler(int levelLimit) { + this.levelLimit = levelLimit; + 
} + + @Override + public synchronized void push(Request request, Task task) { + if (((Integer) request.getExtra("_level")) <= levelLimit) { + super.push(request, task); + } + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java index e6b3f66c..ddbaa088 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java @@ -18,7 +18,8 @@ import static us.codecraft.webmagic.selector.Selectors.xpath; */ public class ZipCodePageProcessor implements PageProcessor { - private Site site = Site.me().setCharset("gb2312").setSleepTime(0).addStartUrl("http://www.ip138.com/post/"); + private Site site = Site.me().setCharset("gb2312") + .setSleepTime(100).addStartUrl("http://www.ip138.com/post/"); @Override public void process(Page page) { @@ -79,5 +80,10 @@ public class ZipCodePageProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).run(); + + PriorityScheduler scheduler = new PriorityScheduler(); + Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(scheduler); + scheduler.push(new Request("http://www.baidu.com/s?wd=webmagic&f=12&rsp=0&oq=webmagix&tn=baiduhome_pg&ie=utf-8"),spider); + spider.run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java new file mode 100644 index 00000000..31af3b2b --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.samples.scheduler; + +import 
org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Request; + +import java.util.concurrent.TimeUnit; + +/** + * @author code4crafter@gmail.com + */ +public class DelayQueueSchedulerTest { + + @Ignore("infinite") + @Test + public void test() { + DelayQueueScheduler delayQueueScheduler = new DelayQueueScheduler(1, TimeUnit.SECONDS); + delayQueueScheduler.push(new Request("1"), null); + while (true){ + Request poll = delayQueueScheduler.poll(null); + System.out.println(System.currentTimeMillis()+"\t"+poll); + } + } +} From 478ace7e973d5ae924ed3345722cf4ef143c0df8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 22 Aug 2013 07:29:18 +0800 Subject: [PATCH 07/39] add FilePageModelPipeline --- .../pipeline/FilePageModelPipeline.java | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java new file mode 100644 index 00000000..d3ed1f02 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -0,0 +1,55 @@ +package us.codecraft.webmagic.pipeline; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.HasKey; +import us.codecraft.webmagic.model.PageModelPipeline; +import us.codecraft.webmagic.utils.FilePersistentBase; + +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; + +/** + * Store results objects (page models) to files in plain format.
+ * Use model.key() as file name if the model implements HasKey.
+ * Otherwise use the MD5 hex digest of the model's reflective toString as file name. + * + * @author code4crafter@gmail.com
+ * @since 0.2.2 + */ +public class FilePageModelPipeline extends FilePersistentBase implements PageModelPipeline { + + private Logger logger = Logger.getLogger(getClass()); + + /** + * new JsonFilePageModelPipeline with default path "/data/webmagic/" + */ + public FilePageModelPipeline() { + setPath("/data/webmagic/"); + } + + public FilePageModelPipeline(String path) { + setPath(path); + } + + @Override + public void process(Object o, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; + try { + String filename; + if (o instanceof HasKey) { + filename = path + ((HasKey) o).key() + ".html"; + } else { + filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".html"; + } + PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename))); + printWriter.write(ToStringBuilder.reflectionToString(o)); + printWriter.close(); + } catch (IOException e) { + logger.warn("write file error", e); + } + } +} From 0cc0ccee3578ed3ce9a3cd920912bc8b1338da87 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 25 Aug 2013 15:41:43 +0800 Subject: [PATCH 08/39] add charset specific for easy call of HttpClientDownloader --- .../webmagic/downloader/HttpClientDownloader.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 75634104..7a063298 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -46,6 +46,17 @@ public class HttpClientDownloader implements Downloader { return (Html) page.getHtml(); } + /** + * A simple method to download a url. 
+ * + * @param url + * @return html + */ + public Html download(String url,String charset) { + Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); + return (Html) page.getHtml(); + } + @Override public Page download(Request request, Task task) { Site site = null; @@ -87,13 +98,12 @@ public class HttpClientDownloader implements Downloader { } while (retry); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { + handleGzip(httpResponse); //charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); charset = UrlUtils.getCharset(value); } - // - handleGzip(httpResponse); return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); From 5e9e8b2541a3b85dadd222bc923540d51f30b09c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 25 Aug 2013 16:30:38 +0800 Subject: [PATCH 09/39] add TextContentSelector --- .../us/codecraft/webmagic/selector/Html.java | 22 ++++-- .../webmagic/selector/PlainText.java | 21 +++++- .../webmagic/selector/Selectable.java | 21 ++++++ .../webmagic/selector/Selectors.java | 12 ++++ .../selector/TextContentSelector.java | 68 +++++++++++++++++++ .../selector/TextContentSelectorTest.java | 34 ++++++++++ 6 files changed, 171 insertions(+), 7 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 1d5e8c59..f3d29aa9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -47,32 +47,44 @@ public class Html extends PlainText { 
@Override public Selectable smartContent() { - SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); + SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, strings); } @Override public Selectable links() { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); + XpathSelector xpathSelector = Selectors.xpath("//a/@href"); return selectList(xpathSelector, strings); } @Override public Selectable xpath(String xpath) { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); + XpathSelector xpathSelector = Selectors.xpath(xpath); return selectList(xpathSelector, strings); } @Override public Selectable $(String selector) { - CssSelector cssSelector = new CssSelector(selector); + CssSelector cssSelector = Selectors.$(selector); return selectList(cssSelector, strings); } @Override public Selectable $(String selector, String attrName) { - CssSelector cssSelector = new CssSelector(selector, attrName); + CssSelector cssSelector = Selectors.$(selector, attrName); return selectList(cssSelector, strings); } + @Override + public Selectable text() { + TextContentSelector selector = Selectors.text(); + return select(selector, strings); + } + + @Override + public Selectable text(String newlineSeparator) { + TextContentSelector selector = Selectors.text(newlineSeparator); + return select(selector, strings); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index e0501eb9..df6926dd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -57,13 +57,13 @@ public class PlainText implements Selectable { @Override public Selectable regex(String regex) { - RegexSelector regexSelector = 
SelectorFactory.getInstatnce().newRegexSelector(regex); + RegexSelector regexSelector = Selectors.regex(regex); return selectList(regexSelector, strings); } @Override public Selectable regex(String regex, int group) { - RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex, group); + RegexSelector regexSelector = Selectors.regex(regex, group); return selectList(regexSelector, strings); } @@ -106,4 +106,21 @@ public class PlainText implements Selectable { return null; } } + + @Override + public Selectable text() { + //do nothing + return this; + } + + @Override + public Selectable text(String newlineSeparator) { + //do nothing + return this; + } + + @Override + public boolean match() { + return strings != null && strings.size() > 0; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 21c93817..398906fa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -82,6 +82,27 @@ public interface Selectable { */ public String toString(); + /** + * select text content of html + * + * @return text + */ + public Selectable text(); + + /** + * select text content of html + * + * @return text + */ + public Selectable text(String newlineSeparator); + + /** + * if result exist for select + * + * @return true if result exist + */ + public boolean match(); + /** * multi string result * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index b52d1287..051d6a43 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -16,6 +16,10 @@ public abstract class Selectors { return 
SelectorFactory.getInstatnce().newRegexSelector(expr, group); } + public static SmartContentSelector smartContent() { + return SelectorFactory.getInstatnce().newSmartContentSelector(); + } + public static CssSelector $(String expr) { return new CssSelector(expr); } @@ -36,6 +40,14 @@ public abstract class Selectors { return new OrSelector(selectors); } + public static TextContentSelector text() { + return new TextContentSelector(); + } + + public static TextContentSelector text(String newlineSeperator) { + return new TextContentSelector(newlineSeperator); + } + public static void main(String[] args) { String s = "a"; or(regex("(.*)"), xpath("//title"), $("title")).select(s); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java new file mode 100644 index 00000000..54e82042 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java @@ -0,0 +1,68 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Extract text content in html.
+ * Algorithm from http://www.elias.cn/En/ExtMainText.
+ * + * @author code4crafter@gmail.com
+ * @since 0.2.2 + */ +public class TextContentSelector implements Selector { + + private String newLineSeperator = "\n"; + + public TextContentSelector() { + } + + public TextContentSelector(String newLineSeperator) { + this.newLineSeperator = newLineSeperator; + } + + private final static Set TAGS_IN_NEWLINE = new HashSet(); + + private final static Set TAGS_TO_IGNORE = new HashSet(); + + static { + TAGS_IN_NEWLINE.addAll(Arrays.asList(new String[]{"p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"})); + TAGS_TO_IGNORE.addAll(Arrays.asList(new String[]{"head", "style", "script", "noscript", "option"})); + } + + @Override + public String select(String text) { + Document doc = Jsoup.parse(text); + return select0(doc); + } + + protected String select0(Element element) { + String tagName = element.tagName().toLowerCase(); + if (TAGS_TO_IGNORE.contains(tagName)) { + return ""; + } + StringBuilder textBuilder = new StringBuilder(); + textBuilder.append(element.text()); + if (element.children() != null) { + for (Element child : element.children()) { + textBuilder.append(select0(child)); + } + } + if (TAGS_IN_NEWLINE.contains(tagName)) { + textBuilder.append(newLineSeperator); + } + return textBuilder.toString(); + } + + @Override + public List selectList(String text) { + throw new UnsupportedOperationException(); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java new file mode 100644 index 00000000..a7a294a6 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.selector; + +import junit.framework.Assert; +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.downloader.HttpClientDownloader; + +/** + * @author code4crafter@gmail.com
+ * @since 0.2.2 + */ +public class TextContentSelectorTest { + + @Test + public void test() { + String html = "
\n" + + "
\n" + + "

Add more powerful selector for content text extract refered to http://www.elias.cn/En/ExtMainText

\n" + + "
\n" + + "
"; + TextContentSelector textContentSelector = new TextContentSelector("
"); + String text = textContentSelector.select(html); + Assert.assertNotNull(text); + } + + @Ignore("takes long time") + @Test + public void testDownload() { + String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8") + .smartContent().text().toString(); + Assert.assertNotNull(text); + } + +} From d7abbd0e4bf5a922b3abc3e75ebed7328df124e9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 25 Aug 2013 16:31:00 +0800 Subject: [PATCH 10/39] fix compile error --- .../us/codecraft/webmagic/selector/TextContentSelectorTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java index a7a294a6..f5018249 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java @@ -28,7 +28,7 @@ public class TextContentSelectorTest { public void testDownload() { String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8") .smartContent().text().toString(); - Assert.assertNotNull(text); + Assert.assertNotNull(s); } } From e87489d5dc74d59bc407271bdc2d3f9c9493c118 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 30 Aug 2013 17:32:27 +0800 Subject: [PATCH 11/39] syntax hightlight --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index fa726dae..319798e8 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ Add dependencies to your pom.xml: Write a class implements PageProcessor: +```java public class OschinaBlogPageProcesser implements PageProcessor { private Site site = Site.me().setDomain("my.oschina.net") @@ -61,6 +62,7 @@ Write a class implements 
PageProcessor: .pipeline(new ConsolePipeline()).run(); } } +``` * `page.addTargetRequests(links)` @@ -68,6 +70,7 @@ Write a class implements PageProcessor: You can also use annotation way: +```java @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog { @@ -86,6 +89,7 @@ You can also use annotation way: new ConsolePageModelPipeline(), OschinaBlog.class).run(); } } +``` ### Docs and samples: From b1cba78bd6930bbbc3d44b4825fcc752932ca02c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 07:30:31 +0800 Subject: [PATCH 12/39] xsoup test --- webmagic-saxon/pom.xml | 5 ++ .../webmagic/selector/XpathSelectorTest.java | 77 +++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index efa82919..1c4e745e 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -17,6 +17,11 @@ webmagic-core ${project.version} + + us.codecraft + xsoup + 0.0.1-SNAPSHOT + net.sf.saxon Saxon-HE diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index b6230406..6c19c8ad 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,8 +1,15 @@ package us.codecraft.webmagic.selector; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.htmlcleaner.XPatherException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; +import us.codecraft.xsoup.XPathEvaluator; +import us.codecraft.xsoup.Xsoup; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 @@ -1353,6 +1360,7 @@ public class XpathSelectorTest { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); + Selectors.xpath("/abc/").select(""); } @Test @@ -1379,17 +1387,86 @@ public class XpathSelectorTest { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis()-time); + XpathSelector xpathSelector = new XpathSelector("//a"); time =System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } System.out.println(System.currentTimeMillis()-time); + time =System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } + System.out.println(System.currentTimeMillis() - time); + + CssSelector cssSelector = new CssSelector("a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + cssSelector.selectList(html); + } + System.out.println("css "+(System.currentTimeMillis()-time)); + } + + @Ignore("take long time") + @Test + public void parserPerformanceTest() throws XPatherException { + System.out.println(html.length()); + + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(html); + Document document = Jsoup.parse(html); + + long time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + htmlCleaner.clean(html); + } + System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + tagNode.evaluateXPath("//a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + Jsoup.parse(html); + } + System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + document.select("a"); + } + 
System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + htmlCleaner.clean(html); + } System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + tagNode.evaluateXPath("//a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + XPathEvaluator compile = Xsoup.compile("//a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + compile.evaluate(document); + } + System.out.println(System.currentTimeMillis()-time); + } } From 55d4a76ab7f6238a60e917371ea54164d569edab Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 08:21:32 +0800 Subject: [PATCH 13/39] newselectors --- webmagic-core/pom.xml | 6 ++++ .../selector/BaseElementSelector.java | 23 +++++++++++++ .../webmagic/selector/CssSelector.java | 26 +++++++-------- .../webmagic/selector/ElementSelector.java | 32 +++++++++++++++++++ .../webmagic/selector/XsoupSelector.java | 32 +++++++++++++++++++ 5 files changed, 104 insertions(+), 15 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index b19820df..ef9f84aa 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -25,6 +25,12 @@ commons-lang3
+ + us.codecraft + xsoup + 0.0.1-SNAPSHOT + + log4j log4j diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java new file mode 100644 index 00000000..d14a708a --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; + +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.2.2 + */ +public abstract class BaseElementSelector implements Selector,ElementSelector { + + @Override + public String select(String text) { + return select(Jsoup.parse(text)); + } + + @Override + public List selectList(String text) { + return selectList(Jsoup.parse(text)); + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 5031077c..9c7032c0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -1,8 +1,6 @@ package us.codecraft.webmagic.selector; import org.apache.commons.collections.CollectionUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; @@ -15,7 +13,7 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.1.0 */ -public class CssSelector implements Selector { +public class CssSelector extends BaseElementSelector { private String selectorText; @@ -30,16 +28,6 @@ public class CssSelector implements Selector { this.attrName = attrName; } - @Override - public String select(String text) { - Document doc = Jsoup.parse(text); - Elements elements = doc.select(selectorText); - if (CollectionUtils.isEmpty(elements)) { - return null; - } - return getValue(elements.get(0)); - } - private String getValue(Element element) { if (attrName == null) { return element.outerHtml(); @@ -51,9 +39,17 @@ public class CssSelector implements Selector { } @Override - public List selectList(String text) { + public String select(Element element) { + Elements elements = element.select(selectorText); + if (CollectionUtils.isEmpty(elements)) { + return null; + } + return getValue(elements.get(0)); + } + + @Override + public List selectList(Element doc) { List strings = new ArrayList(); - Document doc = Jsoup.parse(text); Elements elements = doc.select(selectorText); if (CollectionUtils.isNotEmpty(elements)) { for (Element element : elements) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java new file mode 100644 index 00000000..793b8256 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; + +import java.util.List; + +/** + * Selector(extractor) for html elements.
+ * + * @author code4crafter@gmail.com
+ * @since 0.2.2 + */ +public interface ElementSelector { + + /** + * Extract a single result from the element.
+ * If there is more than one result, only the first will be chosen. + * + * @param element + * @return result + */ + public String select(Element element); + + /** + * Extract all results from the element.
+ * + * @param element + * @return results + */ + public List selectList(Element element); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java new file mode 100644 index 00000000..698b29bd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; +import us.codecraft.xsoup.XPathEvaluator; +import us.codecraft.xsoup.Xsoup; + +import java.util.List; + +/** + * XPath selector based on Xsoup.
+ * + * @author code4crafter@gmail.com
+ * @since 0.2.2 + */ +public class XsoupSelector extends BaseElementSelector { + + private XPathEvaluator xPathEvaluator; + + public XsoupSelector(String xpathStr) { + this.xPathEvaluator = Xsoup.compile(xpathStr); + } + + @Override + public String select(Element element) { + return xPathEvaluator.evaluate(element).get(); + } + + @Override + public List selectList(Element element) { + return xPathEvaluator.evaluate(element).list(); + } +} From d7cd9e5747859b41cc5d97fbebfc80bdc88ad78b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 11:56:01 +0800 Subject: [PATCH 14/39] update pom --- .../main/java/us/codecraft/webmagic/selector/Html.java | 2 +- .../java/us/codecraft/webmagic/utils/ExtractorUtils.java | 9 +++------ webmagic-samples/pom.xml | 2 +- .../codecraft/webmagic/samples/DiaoyuwengProcessor.java | 7 ++++++- .../us/codecraft/webmagic/samples/F58PageProcesser.java | 9 +++++++-- .../us/codecraft/webmagic/samples/HuxiuProcessor.java | 5 +++++ 6 files changed, 23 insertions(+), 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index f3d29aa9..493c7629 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -59,7 +59,7 @@ public class Html extends PlainText { @Override public Selectable xpath(String xpath) { - XpathSelector xpathSelector = Selectors.xpath(xpath); + XsoupSelector xpathSelector = new XsoupSelector(xpath); return selectList(xpathSelector, strings); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 5c6ebbf8..10996362 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -1,10 +1,7 @@ package us.codecraft.webmagic.utils; import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.selector.CssSelector; -import us.codecraft.webmagic.selector.RegexSelector; -import us.codecraft.webmagic.selector.Selector; -import us.codecraft.webmagic.selector.XpathSelector; +import us.codecraft.webmagic.selector.*; import java.util.ArrayList; import java.util.List; @@ -27,10 +24,10 @@ public class ExtractorUtils { selector = new RegexSelector(value); break; case XPath: - selector = new XpathSelector(value); + selector = new XsoupSelector(value); break; default: - selector = new XpathSelector(value); + selector = new XsoupSelector(value); } return selector; } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 35ddcaa4..a349a68e 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1 + 0.2.2-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 115f1834..3ceba0af 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.PlainText; @@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor { page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); - page.putField("content", 
page.getHtml().xpath("//div[@class='pcb']//tbody")); + page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()")); page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } @@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor { } return site; } + + public static void main(String[] args) { + Spider.create(new DiaoyuwengProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 4ffe127b..7124a8c5 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all(); + List strings = page.getHtml().links().regex(".*/yewu/.*").all(); page.addTargetRequests(strings); page.putField("title",page.getHtml().regex("(.*)")); - page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); + page.putField("body",page.getHtml().xpath("//dd")); } @Override public Site getSite() { return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. 
} + + public static void main(String[] args) { + Spider.create(new F58PageProcesser()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 89b74d63..4ac93107 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -26,4 +27,8 @@ public class HuxiuProcessor implements PageProcessor { return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new HuxiuProcessor()).run(); + } } From 85b7cf1563337ae07e448d3de0f5c5939fa676b6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 13:52:41 +0800 Subject: [PATCH 15/39] complete test --- .../main/java/us/codecraft/webmagic/selector/Html.java | 2 +- .../us/codecraft/webmagic/samples/HuxiuProcessor.java | 5 ++--- .../webmagic/samples/InfoQMiniBookProcessor.java | 4 ---- .../codecraft/webmagic/samples/IteyeBlogProcessor.java | 3 +-- .../us/codecraft/webmagic/samples/KaichibaProcessor.java | 5 +++++ .../us/codecraft/webmagic/samples/MeicanProcessor.java | 9 +++++++-- .../webmagic/samples/OschinaBlogPageProcesser.java | 9 ++++----- 7 files changed, 20 insertions(+), 17 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 493c7629..a4ea0d37 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -53,7 +53,7 @@ public class Html extends PlainText { @Override public Selectable links() { - XpathSelector xpathSelector = Selectors.xpath("//a/@href"); + XsoupSelector xpathSelector = new XsoupSelector("//a/@href"); return selectList(xpathSelector, strings); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 4ac93107..136eeb83 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -15,10 +15,9 @@ import java.util.List; public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all(); + List requests = page.getHtml().links().regex(".*article.*").all(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); page.putField("content",page.getHtml().smartContent()); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index b43c3c56..38de3bc0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils; import us.codecraft.webmagic.Page; import 
us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new InfoQMiniBookProcessor()) - .scheduler(new RedisScheduler("localhost")) - .pipeline(new FilePipeline("/data/temp/webmagic/")) .thread(5) .run(); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index c0b3f731..f80f895a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor { } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index aff18a6d..0ab6c644 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import 
us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor { return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new KaichibaProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index a4e6e43b..bfa347d2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor { } page.addTargetRequests(requests); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); - page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); - page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); + page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); + page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); } @Override @@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor { return 
Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new MeicanProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 8ba7063b..e447003b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -1,9 +1,8 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor { public void process(Page page) { List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); + page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new 
OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run(); + Spider.create(new OschinaBlogPageProcesser()).run(); } } From 2c3574537afd2707251e82d248f260cc2e333356 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 14:14:24 +0800 Subject: [PATCH 16/39] refactor in selectors --- .../us/codecraft/webmagic/selector/Html.java | 42 +++++---- .../webmagic/selector/PlainText.java | 14 +-- .../webmagic/selector/Selectable.java | 14 --- .../webmagic/selector/SelectorFactory.java | 91 ------------------- .../webmagic/selector/Selectors.java | 20 ++-- .../selector/TextContentSelector.java | 68 -------------- .../selector/TextContentSelectorTest.java | 34 ------- 7 files changed, 35 insertions(+), 248 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index a4ea0d37..06987d86 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -1,5 +1,8 @@ package us.codecraft.webmagic.selector; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + import java.util.ArrayList; import java.util.List; @@ -11,12 +14,23 @@ import java.util.List; */ public class Html extends PlainText { + /** + * Store parsed document for better performance when only one text exist. 
+ */ + private Document document; + public Html(List strings) { super(strings); } public Html(String text) { super(text); + this.document = Jsoup.parse(text); + } + + public Html(Document document) { + super(document.html()); + this.document = document; } public static Html create(String text) { @@ -53,38 +67,34 @@ public class Html extends PlainText { @Override public Selectable links() { - XsoupSelector xpathSelector = new XsoupSelector("//a/@href"); - return selectList(xpathSelector, strings); + return xpath("//a/@href"); } @Override public Selectable xpath(String xpath) { - XsoupSelector xpathSelector = new XsoupSelector(xpath); - return selectList(xpathSelector, strings); + XsoupSelector xsoupSelector = new XsoupSelector(xpath); + if (document!=null){ + return new Html(xsoupSelector.selectList(document)); + } + return selectList(xsoupSelector, strings); } @Override public Selectable $(String selector) { CssSelector cssSelector = Selectors.$(selector); + if (document!=null){ + return new Html(cssSelector.selectList(document)); + } return selectList(cssSelector, strings); } @Override public Selectable $(String selector, String attrName) { CssSelector cssSelector = Selectors.$(selector, attrName); + if (document!=null){ + return new Html(cssSelector.selectList(document)); + } return selectList(cssSelector, strings); } - @Override - public Selectable text() { - TextContentSelector selector = Selectors.text(); - return select(selector, strings); - } - - @Override - public Selectable text(String newlineSeparator) { - TextContentSelector selector = Selectors.text(newlineSeparator); - return select(selector, strings); - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index df6926dd..9406f3ab 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -89,7 +89,7 @@ public class PlainText implements Selectable { @Override public Selectable replace(String regex, String replacement) { - ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement); + ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); return select(replaceSelector, strings); } @@ -107,18 +107,6 @@ public class PlainText implements Selectable { } } - @Override - public Selectable text() { - //do nothing - return this; - } - - @Override - public Selectable text(String newlineSeparator) { - //do nothing - return this; - } - @Override public boolean match() { return strings != null && strings.size() > 0; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 398906fa..66df5d5b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -82,20 +82,6 @@ public interface Selectable { */ public String toString(); - /** - * select text content of html - * - * @return text - */ - public Selectable text(); - - /** - * select text content of html - * - * @return text - */ - public Selectable text(String newlineSeparator); - /** * if result exist for select * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java deleted file mode 100644 index 8a0c76c9..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ /dev/null @@ -1,91 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.apache.commons.lang3.StringUtils; - -import java.lang.reflect.Constructor; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - 
-/** - * Selector factory with some inner cache.
- * - * @author code4crafter@gmail.com
- * @since 0.1.0 - */ -public class SelectorFactory { - - private Map innerCache = new ConcurrentHashMap(); - - private static final SelectorFactory INSTATNCE = new SelectorFactory(); - - public static SelectorFactory getInstatnce() { - return INSTATNCE; - } - - public RegexSelector newRegexSelector(String regex) { - return newSelector(RegexSelector.class, regex); - } - - public RegexSelector newRegexSelector(String regex, int group) { - String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group)); - if (innerCache.get(cacheKey) != null) { - return (RegexSelector) innerCache.get(cacheKey); - } - return new RegexSelector(regex, group); - } - - public ReplaceSelector newReplaceSelector(String regex, String replacement) { - return newSelector(ReplaceSelector.class, regex, replacement); - } - - public XpathSelector newXpathSelector(String xpath) { - return newSelector(XpathSelector.class, xpath); - } - - public SmartContentSelector newSmartContentSelector() { - return newSelector(SmartContentSelector.class); - } - - public T newAndCacheSelector(Class clazz, String... param) { - String cacheKey = getCacheKey(RegexSelector.class, param); - if (innerCache.get(cacheKey) != null) { - return (T) innerCache.get(cacheKey); - } - T selector = newSelector(clazz, param); - if (selector != null) { - innerCache.put(cacheKey, selector); - } - return selector; - - } - - public T newSelector(Class clazz, String... 
param) { - try { - if (param.length == 0) { - Constructor constructor - = clazz.getConstructor(); - T selector = constructor.newInstance(); - return selector; - } else if (param.length == 1) { - Constructor constructor - = clazz.getConstructor(String.class); - T selector = constructor.newInstance(param[0]); - return selector; - } else if (param.length == 2) { - Constructor constructor - = clazz.getConstructor(String.class, String.class); - T selector = constructor.newInstance(param[0], param[1]); - return selector; - } else { - throw new UnsupportedOperationException(); - } - } catch (Exception e) { - throw new IllegalArgumentException("init object error", e); - } - } - - private String getCacheKey(Class clazz, String... param) { - return clazz.toString() + "_" + StringUtils.join(param, "_"); - } - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 051d6a43..9764641c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector; public abstract class Selectors { public static RegexSelector regex(String expr) { - return SelectorFactory.getInstatnce().newRegexSelector(expr); + return new RegexSelector(expr); } public static RegexSelector regex(String expr, int group) { - return SelectorFactory.getInstatnce().newRegexSelector(expr, group); + return new RegexSelector(expr,group); } public static SmartContentSelector smartContent() { - return SelectorFactory.getInstatnce().newSmartContentSelector(); + return new SmartContentSelector(); } public static CssSelector $(String expr) { @@ -29,7 +29,11 @@ public abstract class Selectors { } public static XpathSelector xpath(String expr) { - return SelectorFactory.getInstatnce().newXpathSelector(expr); + return new XpathSelector(expr); + } + + public 
static XsoupSelector xsoup(String expr) { + return new XsoupSelector(expr); } public static AndSelector and(Selector... selectors) { @@ -40,14 +44,6 @@ public abstract class Selectors { return new OrSelector(selectors); } - public static TextContentSelector text() { - return new TextContentSelector(); - } - - public static TextContentSelector text(String newlineSeperator) { - return new TextContentSelector(newlineSeperator); - } - public static void main(String[] args) { String s = "a"; or(regex("(.*)"), xpath("//title"), $("title")).select(s); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java deleted file mode 100644 index 54e82042..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java +++ /dev/null @@ -1,68 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -/** - * Extract text content in html.
- * Algorithm from http://www.elias.cn/En/ExtMainText.
- * - * @author code4crafter@gmail.com
- * @since 0.2.2 - */ -public class TextContentSelector implements Selector { - - private String newLineSeperator = "\n"; - - public TextContentSelector() { - } - - public TextContentSelector(String newLineSeperator) { - this.newLineSeperator = newLineSeperator; - } - - private final static Set TAGS_IN_NEWLINE = new HashSet(); - - private final static Set TAGS_TO_IGNORE = new HashSet(); - - static { - TAGS_IN_NEWLINE.addAll(Arrays.asList(new String[]{"p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"})); - TAGS_TO_IGNORE.addAll(Arrays.asList(new String[]{"head", "style", "script", "noscript", "option"})); - } - - @Override - public String select(String text) { - Document doc = Jsoup.parse(text); - return select0(doc); - } - - protected String select0(Element element) { - String tagName = element.tagName().toLowerCase(); - if (TAGS_TO_IGNORE.contains(tagName)) { - return ""; - } - StringBuilder textBuilder = new StringBuilder(); - textBuilder.append(element.text()); - if (element.children() != null) { - for (Element child : element.children()) { - textBuilder.append(select0(child)); - } - } - if (TAGS_IN_NEWLINE.contains(tagName)) { - textBuilder.append(newLineSeperator); - } - return textBuilder.toString(); - } - - @Override - public List selectList(String text) { - throw new UnsupportedOperationException(); - } - -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java deleted file mode 100644 index f5018249..00000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.selector; - -import junit.framework.Assert; -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.downloader.HttpClientDownloader; - -/** - * @author code4crafter@gmail.com
- * @since 0.2.2 - */ -public class TextContentSelectorTest { - - @Test - public void test() { - String html = "
\n" + - "
\n" + - "

Add more powerful selector for content text extract refered to http://www.elias.cn/En/ExtMainText

\n" + - "
\n" + - "
"; - TextContentSelector textContentSelector = new TextContentSelector("
"); - String text = textContentSelector.select(html); - Assert.assertNotNull(text); - } - - @Ignore("takes long time") - @Test - public void testDownload() { - String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8") - .smartContent().text().toString(); - Assert.assertNotNull(s); - } - -} From 326b97c65a3e9516d06ef7e46da53757ac04f175 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 00:15:54 +0800 Subject: [PATCH 17/39] update --- .../main/java/us/codecraft/webmagic/Page.java | 7 ++-- .../webmagic/selector/CacheElement.java | 36 +++++++++++++++++++ .../us/codecraft/webmagic/selector/Html.java | 30 ++++++++++++++++ .../webmagic/model/PageModelExtractor.java | 19 ++++++---- 4 files changed, 83 insertions(+), 9 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 93c184d8..0821e6d4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic; import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; @@ -28,7 +29,7 @@ public class Page { private ResultItems resultItems = new ResultItems(); - private Selectable html; + private Html html; private Selectable url; @@ -58,11 +59,11 @@ public class Page { * * @return html */ - public Selectable getHtml() { + public Html getHtml() { return html; } - public void setHtml(Selectable html) { + public void setHtml(Html html) { this.html = html; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java new file mode 100644 index 00000000..a58eba2a --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; + +import java.util.List; + +/** + * Cache parsed element for extract. + * + * @author code4crafter@gmail.com + * @since 0.2.2 + */ +public class CacheElement { + + public String text; + + public Element element; + + public String select(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.select(getElement()); + } else { + return selector.select(getText()); + } + } + + public List selectList(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.selectList(getElement()); + } else { + return selector.selectList(getText()); + } + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 06987d86..74aa976b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -97,4 +97,34 @@ public class Html extends PlainText { return selectList(cssSelector, strings); } + public Document getDocument() { + return document; + } + + public String getText() { + return document.html(); + } + + /** + * + * @param selector + * @return + */ + public String select(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.select(getDocument()); + } else { + return selector.select(getText()); + } + } + + public List selectList(Selector selector) { + if (selector instanceof 
ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.selectList(getDocument()); + } else { + return selector.selectList(getText()); + } + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index a16c7a1b..88490524 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; +import org.jsoup.nodes.Element; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.selector.*; @@ -34,7 +35,7 @@ class PageModelExtractor { private List fieldExtractors; - private Extractor extractor; + private Extractor objectExtractor; public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); @@ -169,7 +170,7 @@ class PageModelExtractor { annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); } } @@ -183,12 +184,12 @@ class PageModelExtractor { if (!matched) { return null; } - if (extractor == null) { + if (objectExtractor == null) { return processSingle(page, page.getHtml().toString()); } else { - if (extractor.multi) { + if (objectExtractor.multi) { List os = new ArrayList(); - List list = extractor.getSelector().selectList(page.getHtml().toString()); + List list = 
objectExtractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { Object o = processSingle(page, s); if (o != null) { @@ -197,13 +198,19 @@ class PageModelExtractor { } return os; } else { - String select = extractor.getSelector().select(page.getHtml().toString()); + String select = objectExtractor.getSelector().select(page.getHtml().toString()); Object o = processSingle(page, select); return o; } } } + private List select(Selector selector,Element element,String html){ + if (selector instanceof ElementSelector){ + + } + } + private Object processSingle(Page page, String html) { Object o = null; try { From 2ec6abfa65d0c68d949c1d7257d17ed44cc53681 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 00:16:20 +0800 Subject: [PATCH 18/39] fix grammer error in en --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 319798e8..752e662b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ webmagic [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) ->A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler. +>A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler. 
## Features: From b9eeb88f7773a8de62a86f2b68ae2a31994c29e4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 07:51:18 +0800 Subject: [PATCH 19/39] benchmark --- .../webmagic/model/ProcessorBenchmark.java | 891 ++++++++++++++++++ 1 file changed, 891 insertions(+) create mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java new file mode 100644 index 00000000..5513305d --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java @@ -0,0 +1,891 @@ +package us.codecraft.webmagic.model; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * @author code4crafter@gmail.com + */ +public class ProcessorBenchmark { + + @Ignore + @Test + public void test() { + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class); + Page page = new Page(); + page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); + page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); + page.setHtml(new Html(html)); + long time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + } + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + 
+ " \n" + + " Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t\t
\n" + + " \t开源中国社区\n" + + "
\n" + + "
开源项目发现、使用和交流平台
\n" + + "\t\t
\n" + + " \t\n" + + "
\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\t\t
\n" + + "\t\t当前访客身份:\n" + + "\t\t\t\t黄亿华 [ 退出 ]\n" + + "\t\t\t\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t你有0新留言\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\n" + + "\t\t
\n" + + "\t\t
\n" + + " \t\t
\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + + " \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t
\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + "
软件
\n" + + " \n" + + "
\n" + + "\t\t\t\t\t\t\t\n" + + " \t\t
\n" + + "\t\t
\n" + + "\t\t
\n" + + "\t
\n" + + "\t
\t\n" + + "\n" + + "
\n" + + "
\n" + + "\t\t切换风格 \"黄亿华\"\n" + + " \n" + + " 黄亿华\n" + + "\t\t\n" + + "\t\t\t\n" + + " \t\t\t修改资料\n" + + "\t\t\t更换头像\n" + + " \t\t\n" + + " \n" + + "
\n" + + "
\n" + + " \t关注(43)\n" + + " \t粉丝(98)\n" + + " \t积分(173)\n" + + "
\n" + + "
\n" + + "
\n" + + "码农一枚
实用主义者
抵制重复造轮子,却造了不少轮子
http://codecraft.us
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t.发表博文\n" + + "\t.空间管理\n" + + "
\n" + + " 管理» 博客分类\n" + + " \n" + + "
\n" + + "
\n" + + " 管理» 最新评论 \n" + + "
    \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“searchjack”的评论 不是好的就会被认...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:不是好的就会被认可, 干自己的, 到时候, 单干\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:极好的工具,\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@静风流云:貌似,OSC也是类似处理的。\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t
\n" + + "
\n" + + "
\n" + + "访客统计\n" + + "
    \n" + + "\t
  • 6 (查看最新访客»)
  • \n" + + "
  • 284
  • \n" + + "
  • 817
  • \n" + + "
  • 1888
  • \n" + + "
  • 16453
  • \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\t
\n" + + " \t\n" + + "\t
\n" + + "\t\n" + + " \t
\t\t\n" + + "
\n" + + "

Jsoup代码解读之八-防御XSS攻击

\n" + + "
\n" + + " \t\t \t\t \t\t\n" + + " \t\t\t编辑 | 删除\n" + + " \t\t\n" + + "\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" + + " \t\t已有1628次阅读 ,共3个评论\n" + + " \t\t\t\t\t,共 79 人收藏此文 \t
\n" + + "
\n" + + "\t \t
\n" + + "

目录:[ - ]

\n" + + " \n" + + " \t
\n" + + " \n" + + "\t \t

\n" + + "\n" + + "

防御XSS攻击的一般原理

\n" + + "

cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。

\n" + + "

我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<,>,",'是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。

\n" + + "

在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:

\n" + + "
    \n" + + "
  1. 将HTML解析为DOM树

    这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用</textarea>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。

  2. \n" + + "
  3. 过滤高风险标签/属性/属性值

    高风险标签是指<script>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如onclick='alert("xss!")'

  4. \n" + + "
  5. 重新将DOM树输出为HTML文本

    DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。

  6. \n" + + "
\n" + + "\n" + + "

Cleaner与Whitelist

\n" + + "

对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。

\n" + + "

Jsoup给出的答案是白名单。下面是Whitelist的部分代码。

\n" + + "
public class Whitelist {\n" +
+            "    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
+            "    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
+            "    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
+            "    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
+            "    private boolean preserveRelativeLinks; // option to preserve relative links\n" +
+            "}
\n" + + "

这里定义了标签名/属性名/属性值的白名单。

\n" + + "

Cleaner是过滤的执行者。不出所料,Cleaner内部定义了CleaningVisitor来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到Element destination里去。

\n" + + "
private final class CleaningVisitor implements NodeVisitor {\n" +
+            "    private int numDiscarded = 0;\n" +
+            "    private final Element root;\n" +
+            "    private Element destination; // current element to append nodes to\n" +
+            "\n" +
+            "    private CleaningVisitor(Element root, Element destination) {\n" +
+            "        this.root = root;\n" +
+            "        this.destination = destination;\n" +
+            "    }\n" +
+            "\n" +
+            "    public void head(Node source, int depth) {\n" +
+            "        if (source instanceof Element) {\n" +
+            "            Element sourceEl = (Element) source;\n" +
+            "\n" +
+            "            if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
+            "                ElementMeta meta = createSafeElement(sourceEl);\n" +
+            "                Element destChild = meta.el;\n" +
+            "                destination.appendChild(destChild);\n" +
+            "\n" +
+            "                numDiscarded += meta.numAttribsDiscarded;\n" +
+            "                destination = destChild;\n" +
+            "            } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
+            "                numDiscarded++;\n" +
+            "            }\n" +
+            "        } else if (source instanceof TextNode) {\n" +
+            "            TextNode sourceText = (TextNode) source;\n" +
+            "            TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
+            "            destination.appendChild(destText);\n" +
+            "        } else { // else, we don't care about comments, xml proc instructions, etc\n" +
+            "            numDiscarded++;\n" +
+            "        }\n" +
+            "    }\n" +
+            "\n" +
+            "    public void tail(Node source, int depth) {\n" +
+            "        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
+            "            destination = destination.parent(); // would have descended, so pop destination stack\n" +
+            "        }\n" +
+            "    }\n" +
+            "}
\n" + + "\n" + + "

结束语

\n" + + "

至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:

\n" + + "
    \n" + + "
  • 最好的代码抽象,是对现实概念的映射。

    这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。

  • \n" + + "
  • 不要过度抽象

    在Jsoup里,只用到了两个接口,一个是NodeVisitor,一个是Connection,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。

    另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。

  • \n" + + "
\n" + + "

最后继续贴上Jsoup解读系列的github地址:https://github.com/code4craft/jsoup-learning/

\n" + + " \t \t \n" + + " \t\n" + + "\t
\n" + + " \t关键字:\n" + + " \t \tJsoup\n" + + " \t \tXSS\n" + + " \t \tOO\n" + + " \t \t
\n" + + "\t \t \n" + + "
\t\t\n" + + "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + + "\t \t
\n" + + "\n" + + " \n" + + "\t
\n" + + "\n" + + "\t\n" + + "\t
\n" + + "\t\n" + + "\t\n" + + "\t\t分享到: \n" + + "\t\t\n" + + "\t\t\n" + + "\t\n" + + " 已有 0人顶\n" + + "\t\n" + + "\t
\n" + + "\t\t\n" + + "
\n" + + "
\n" + + "
\n" + + "

共有 3 条网友评论

\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"静风流云\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t1楼:静风流云 发表于 2013-09-01 08:34 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    貌似,OSC也是类似处理的。
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"黄亿华\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t2楼:黄亿华 发表于 2013-09-01 08:37 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t
    \n" + + "\t\t

    引用来自“静风流云”的评论

    貌似,OSC也是类似处理的。

    OSC就是使用Jsoup做解析的,见这里:http://www.oschina.net/p/jsoup
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"searchjack\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t3楼:searchjack 发表于 2013-09-02 09:20 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    极好的工具,
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t
\n" + + "
\n" + + "\t
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\t \n" + + "\t \n" + + "\t 文明上网,理性发言\n" + + "
\n" + + "\t回到页首 | 回到评论列表\n" + + "
\n" + + "
\n" + + "\t\n" + + "
\n" + + "\t关闭相关文章阅读\n" + + "\t\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + ""; +} From 194518fd82f31e1a08f8966f26324c2e9381ddc3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 08:21:34 +0800 Subject: [PATCH 20/39] add switch --- .../java/us/codecraft/webmagic/Spider.java | 9 + .../webmagic/selector/CacheElement.java | 36 - .../us/codecraft/webmagic/selector/Html.java | 26 +- .../webmagic/utils/EnvironmentUtil.java | 28 + .../webmagic/utils/EnvironmentUtilTest.java | 18 + .../webmagic/model/PageModelExtractor.java | 31 +- .../webmagic/utils/ExtractorUtils.java | 17 +- .../codecraft/model/ProcessorBenchmark.java | 890 ++++++++++++++++++ 8 files changed, 992 insertions(+), 63 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java create mode 100644 webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index c5c239fb..723e8058 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.ThreadUtils; import java.io.Closeable; @@ -368,6 +369,14 @@ public class Spider implements Runnable, Task { return this; } + /** + * switch off xsoup + * @return + */ + public static void xsoupOff(){ + EnvironmentUtil.setUseXsoup(false); + } + @Override public String getUUID() { if (uuid != 
null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java deleted file mode 100644 index a58eba2a..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java +++ /dev/null @@ -1,36 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.nodes.Element; - -import java.util.List; - -/** - * Cache parsed element for extract. - * - * @author code4crafter@gmail.com - * @since 0.2.2 - */ -public class CacheElement { - - public String text; - - public Element element; - - public String select(Selector selector) { - if (selector instanceof ElementSelector) { - ElementSelector elementSelector = (ElementSelector) selector; - return elementSelector.select(getElement()); - } else { - return selector.select(getText()); - } - } - - public List selectList(Selector selector) { - if (selector instanceof ElementSelector) { - ElementSelector elementSelector = (ElementSelector) selector; - return elementSelector.selectList(getElement()); - } else { - return selector.selectList(getText()); - } - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 74aa976b..17988249 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import us.codecraft.webmagic.utils.EnvironmentUtil; import java.util.ArrayList; import java.util.List; @@ -72,17 +73,22 @@ public class Html extends PlainText { @Override public Selectable xpath(String xpath) { - XsoupSelector xsoupSelector = new XsoupSelector(xpath); - if (document!=null){ - return new Html(xsoupSelector.selectList(document)); + if (EnvironmentUtil.useXsoup()) { + 
XsoupSelector xsoupSelector = new XsoupSelector(xpath); + if (document != null) { + return new Html(xsoupSelector.selectList(document)); + } + return selectList(xsoupSelector, strings); + } else { + XpathSelector xpathSelector = new XpathSelector(xpath); + return selectList(xpathSelector, strings); } - return selectList(xsoupSelector, strings); } @Override public Selectable $(String selector) { CssSelector cssSelector = Selectors.$(selector); - if (document!=null){ + if (document != null) { return new Html(cssSelector.selectList(document)); } return selectList(cssSelector, strings); @@ -91,7 +97,7 @@ public class Html extends PlainText { @Override public Selectable $(String selector, String attrName) { CssSelector cssSelector = Selectors.$(selector, attrName); - if (document!=null){ + if (document != null) { return new Html(cssSelector.selectList(document)); } return selectList(cssSelector, strings); @@ -102,15 +108,17 @@ public class Html extends PlainText { } public String getText() { + if (strings!=null&&strings.size()>0){ + return strings.get(0); + } return document.html(); } /** - * * @param selector * @return */ - public String select(Selector selector) { + public String selectDocument(Selector selector) { if (selector instanceof ElementSelector) { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.select(getDocument()); @@ -119,7 +127,7 @@ public class Html extends PlainText { } } - public List selectList(Selector selector) { + public List selectDocumentForList(Selector selector) { if (selector instanceof ElementSelector) { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.selectList(getDocument()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java new file mode 100644 index 00000000..1d63aecd --- /dev/null +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.utils; + +import org.apache.commons.lang3.BooleanUtils; + +import java.util.Properties; + +/** + * @author code4crafter@gmail.com + * @since 0.2.2 + */ +public abstract class EnvironmentUtil { + + private static final String USE_XSOUP = "xsoup"; + + public static boolean useXsoup() { + Properties properties = System.getProperties(); + Object o = properties.get(USE_XSOUP); + if (o == null) { + return true; + } + return BooleanUtils.toBoolean(((String) o).toLowerCase()); + } + + public static void setUseXsoup(boolean useXsoup) { + Properties properties = System.getProperties(); + properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false")); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java new file mode 100644 index 00000000..cb620e7a --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Test; + +import static junit.framework.Assert.*; + +/** + * @author code4crafter@gmail.com + */ +public class EnvironmentUtilTest { + + @Test + public void test() { + assertTrue(EnvironmentUtil.useXsoup()); + EnvironmentUtil.setUseXsoup(false); + assertFalse(EnvironmentUtil.useXsoup()); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 88490524..03cd3a3a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; 
-import org.jsoup.nodes.Element; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.selector.*; @@ -185,13 +184,13 @@ class PageModelExtractor { return null; } if (objectExtractor == null) { - return processSingle(page, page.getHtml().toString()); + return processSingle(page, null, false); } else { if (objectExtractor.multi) { List os = new ArrayList(); List list = objectExtractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { - Object o = processSingle(page, s); + Object o = processSingle(page, s, false); if (o != null) { os.add(o); } @@ -199,19 +198,13 @@ class PageModelExtractor { return os; } else { String select = objectExtractor.getSelector().select(page.getHtml().toString()); - Object o = processSingle(page, select); + Object o = processSingle(page, select, false); return o; } } } - private List select(Selector selector,Element element,String html){ - if (selector instanceof ElementSelector){ - - } - } - - private Object processSingle(Page page, String html) { + private Object processSingle(Page page, String html, boolean isRaw) { Object o = null; try { o = clazz.newInstance(); @@ -220,10 +213,14 @@ class PageModelExtractor { List value; switch (fieldExtractor.getSource()) { case RawHtml: - value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); break; case Html: - value = fieldExtractor.getSelector().selectList(html); + if (isRaw) { + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().selectList(html); + } break; case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); @@ -239,10 +236,14 @@ class PageModelExtractor { String value; switch (fieldExtractor.getSource()) { case RawHtml: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + value = 
page.getHtml().selectDocument(fieldExtractor.getSelector()); break; case Html: - value = fieldExtractor.getSelector().select(html); + if (isRaw) { + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().select(html); + } break; case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 10996362..2d9fd51f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -8,6 +8,7 @@ import java.util.List; /** * Tools for annotation converting.
+ * * @author code4crafter@gmail.com
* @since 0.2.1 */ @@ -24,17 +25,27 @@ public class ExtractorUtils { selector = new RegexSelector(value); break; case XPath: - selector = new XsoupSelector(value); + selector = getXpathSelector(value); break; default: - selector = new XsoupSelector(value); + selector = getXpathSelector(value); + } + return selector; + } + + private static Selector getXpathSelector(String value) { + Selector selector; + if (EnvironmentUtil.useXsoup()) { + selector = new XsoupSelector(value); + } else { + selector = new XpathSelector(value); } return selector; } public static List getSelectors(ExtractBy[] extractBies) { List selectors = new ArrayList(); - if (extractBies==null){ + if (extractBies == null) { return selectors; } for (ExtractBy extractBy : extractBies) { diff --git a/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java new file mode 100644 index 00000000..c3f2829b --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java @@ -0,0 +1,890 @@ +package us.codecraft.webmagic.model; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * @author code4crafter@gmail.com + */ +public class ProcessorBenchmark { + + @Test + public void test() { + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class); + Page page = new Page(); + page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); + page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); + page.setHtml(new Html(html)); + long time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + 
modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + } + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t\t
\n" + + " \t开源中国社区\n" + + "
\n" + + "
开源项目发现、使用和交流平台
\n" + + "\t\t
\n" + + " \t\n" + + "
\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\t\t
\n" + + "\t\t当前访客身份:\n" + + "\t\t\t\t黄亿华 [ 退出 ]\n" + + "\t\t\t\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t你有0新留言\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\n" + + "\t\t
\n" + + "\t\t
\n" + + " \t\t
\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + + " \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t
\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + "
软件
\n" + + " \n" + + "
\n" + + "\t\t\t\t\t\t\t\n" + + " \t\t
\n" + + "\t\t
\n" + + "\t\t
\n" + + "\t
\n" + + "\t
\t\n" + + "\n" + + "
\n" + + "
\n" + + "\t\t切换风格 \"黄亿华\"\n" + + " \n" + + " 黄亿华\n" + + "\t\t\n" + + "\t\t\t\n" + + " \t\t\t修改资料\n" + + "\t\t\t更换头像\n" + + " \t\t\n" + + " \n" + + "
\n" + + "
\n" + + " \t关注(43)\n" + + " \t粉丝(98)\n" + + " \t积分(173)\n" + + "
\n" + + "
\n" + + "
\n" + + "码农一枚
实用主义者
抵制重复造轮子,却造了不少轮子
http://codecraft.us
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t.发表博文\n" + + "\t.空间管理\n" + + "
\n" + + " 管理» 博客分类\n" + + " \n" + + "
\n" + + "
\n" + + " 管理» 最新评论 \n" + + "
    \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“searchjack”的评论 不是好的就会被认...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:不是好的就会被认可, 干自己的, 到时候, 单干\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:极好的工具,\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@静风流云:貌似,OSC也是类似处理的。\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t
\n" + + "
\n" + + "
\n" + + "访客统计\n" + + "
    \n" + + "\t
  • 6 (查看最新访客»)
  • \n" + + "
  • 284
  • \n" + + "
  • 817
  • \n" + + "
  • 1888
  • \n" + + "
  • 16453
  • \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\t
\n" + + " \t\n" + + "\t
\n" + + "\t\n" + + " \t
\t\t\n" + + "
\n" + + "

Jsoup代码解读之八-防御XSS攻击

\n" + + "
\n" + + " \t\t \t\t \t\t\n" + + " \t\t\t编辑 | 删除\n" + + " \t\t\n" + + "\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" + + " \t\t已有1628次阅读 ,共3个评论\n" + + " \t\t\t\t\t,共 79 人收藏此文 \t
\n" + + "
\n" + + "\t \t
\n" + + "

目录:[ - ]

\n" + + " \n" + + " \t
\n" + + " \n" + + "\t \t

\n" + + "\n" + + "

防御XSS攻击的一般原理

\n" + + "

cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。

\n" + + "

我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<,>,",'是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。

\n" + + "

在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:

\n" + + "
    \n" + + "
  1. 将HTML解析为DOM树

    这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用</textarea>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。

  2. \n" + + "
  3. 过滤高风险标签/属性/属性值

    高风险标签是指<script>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如onclick='alert("xss!")'

  4. \n" + + "
  5. 重新将DOM树输出为HTML文本

    DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。

  6. \n" + + "
\n" + + "\n" + + "

Cleaner与Whitelist

\n" + + "

对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。

\n" + + "

Jsoup给出的答案是白名单。下面是Whitelist的部分代码。

\n" + + "
public class Whitelist {\n" +
+            "    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
+            "    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
+            "    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
+            "    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
+            "    private boolean preserveRelativeLinks; // option to preserve relative links\n" +
+            "}
\n" + + "

这里定义了标签名/属性名/属性值的白名单。

\n" + + "

Cleaner是过滤的执行者。不出所料,Cleaner内部定义了CleaningVisitor来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到Element destination里去。

\n" + + "
private final class CleaningVisitor implements NodeVisitor {\n" +
+            "    private int numDiscarded = 0;\n" +
+            "    private final Element root;\n" +
+            "    private Element destination; // current element to append nodes to\n" +
+            "\n" +
+            "    private CleaningVisitor(Element root, Element destination) {\n" +
+            "        this.root = root;\n" +
+            "        this.destination = destination;\n" +
+            "    }\n" +
+            "\n" +
+            "    public void head(Node source, int depth) {\n" +
+            "        if (source instanceof Element) {\n" +
+            "            Element sourceEl = (Element) source;\n" +
+            "\n" +
+            "            if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
+            "                ElementMeta meta = createSafeElement(sourceEl);\n" +
+            "                Element destChild = meta.el;\n" +
+            "                destination.appendChild(destChild);\n" +
+            "\n" +
+            "                numDiscarded += meta.numAttribsDiscarded;\n" +
+            "                destination = destChild;\n" +
+            "            } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
+            "                numDiscarded++;\n" +
+            "            }\n" +
+            "        } else if (source instanceof TextNode) {\n" +
+            "            TextNode sourceText = (TextNode) source;\n" +
+            "            TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
+            "            destination.appendChild(destText);\n" +
+            "        } else { // else, we don't care about comments, xml proc instructions, etc\n" +
+            "            numDiscarded++;\n" +
+            "        }\n" +
+            "    }\n" +
+            "\n" +
+            "    public void tail(Node source, int depth) {\n" +
+            "        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
+            "            destination = destination.parent(); // would have descended, so pop destination stack\n" +
+            "        }\n" +
+            "    }\n" +
+            "}
\n" + + "\n" + + "

结束语

\n" + + "

至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:

\n" + + "
    \n" + + "
  • 最好的代码抽象,是对现实概念的映射。

    这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。

  • \n" + + "
  • 不要过度抽象

    在Jsoup里,只用到了两个接口,一个是NodeVisitor,一个是Connection,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。

    另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。

  • \n" + + "
\n" + + "

最后继续贴上Jsoup解读系列的github地址:https://github.com/code4craft/jsoup-learning/

\n" + + " \t \t \n" + + " \t\n" + + "\t
\n" + + " \t关键字:\n" + + " \t \tJsoup\n" + + " \t \tXSS\n" + + " \t \tOO\n" + + " \t \t
\n" + + "\t \t \n" + + "
\t\t\n" + + "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + + "\t \t
\n" + + "\n" + + " \n" + + "\t
\n" + + "\n" + + "\t\n" + + "\t
\n" + + "\t\n" + + "\t\n" + + "\t\t分享到: \n" + + "\t\t\n" + + "\t\t\n" + + "\t\n" + + " 已有 0人顶\n" + + "\t\n" + + "\t
\n" + + "\t\t\n" + + "
\n" + + "
\n" + + "
\n" + + "

共有 3 条网友评论

\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"静风流云\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t1楼:静风流云 发表于 2013-09-01 08:34 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    貌似,OSC也是类似处理的。
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"黄亿华\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t2楼:黄亿华 发表于 2013-09-01 08:37 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t
    \n" + + "\t\t

    引用来自“静风流云”的评论

    貌似,OSC也是类似处理的。

    OSC就是使用Jsoup做解析的,见这里:http://www.oschina.net/p/jsoup
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"searchjack\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t3楼:searchjack 发表于 2013-09-02 09:20 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    极好的工具,
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t
\n" + + "
\n" + + "\t
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\t \n" + + "\t \n" + + "\t 文明上网,理性发言\n" + + "
\n" + + "\t回到页首 | 回到评论列表\n" + + "
\n" + + "
\n" + + "\t\n" + + "
\n" + + "\t关闭相关文章阅读\n" + + "\t\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + ""; +} From aefd0569a5bfb2f8a99de948ccac38302af19500 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 09:36:56 +0800 Subject: [PATCH 21/39] update version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- .../us/codecraft/webmagic/selector/BaseElementSelector.java | 2 +- .../java/us/codecraft/webmagic/selector/ElementSelector.java | 2 +- .../main/java/us/codecraft/webmagic/selector/XsoupSelector.java | 2 +- .../main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java | 2 +- webmagic-extension/pom.xml | 2 +- .../us/codecraft/webmagic/pipeline/FilePageModelPipeline.java | 2 +- webmagic-samples/pom.xml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index e3bd30e0..5b47984b 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.2.2-SNAPSHOT + 0.3.0-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ef9f84aa..9e3d4a2e 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.2.2-SNAPSHOT + 0.3.0-SNAPSHOT 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index d14a708a..e313f243 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -6,7 +6,7 @@ import java.util.List; /** * @author code4crafter@gmail.com - * @since 0.2.2 + * @since 0.3.0 */ public abstract class BaseElementSelector implements Selector,ElementSelector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java index 793b8256..e422ac8c 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java @@ -8,7 +8,7 @@ import java.util.List; * Selector(extractor) for html elements.
* * @author code4crafter@gmail.com
- * @since 0.2.2 + * @since 0.3.0 */ public interface ElementSelector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java index 698b29bd..ea46290a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java @@ -10,7 +10,7 @@ import java.util.List; * XPath selector based on Xsoup.
* * @author code4crafter@gmail.com
- * @since 0.2.2 + * @since 0.3.0 */ public class XsoupSelector extends BaseElementSelector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java index 1d63aecd..7aa5c13e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java @@ -6,7 +6,7 @@ import java.util.Properties; /** * @author code4crafter@gmail.com - * @since 0.2.2 + * @since 0.3.0 */ public abstract class EnvironmentUtil { diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 1914b71b..4cad2b07 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.2.2-SNAPSHOT + 0.3.0-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java index d3ed1f02..55868637 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -18,7 +18,7 @@ import java.io.PrintWriter; * Otherwise use SHA1 as file name. * * @author code4crafter@gmail.com
- * @since 0.2.2 + * @since 0.3.0 */ public class FilePageModelPipeline extends FilePersistentBase implements PageModelPipeline { diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a349a68e..a620ae51 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.2-SNAPSHOT + 0.3.0-SNAPSHOT 4.0.0 From a1ef2523cca08c9e08c0ee3ed60de3ab8fbeb3b1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 09:38:40 +0800 Subject: [PATCH 22/39] update xsoup version --- pom.xml | 5 +++++ webmagic-core/pom.xml | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 5b47984b..58792ff4 100644 --- a/pom.xml +++ b/pom.xml @@ -63,6 +63,11 @@ httpclient 4.2.4 + + us.codecraft + xsoup + 0.1.0 + net.sf.saxon Saxon-HE diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 9e3d4a2e..388cd6e2 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -28,7 +28,6 @@ us.codecraft xsoup - 0.0.1-SNAPSHOT From d141541ef30bc6a9b12a9432bd9a5795008f3d10 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 09:57:19 +0800 Subject: [PATCH 23/39] add retry --- .../java/us/codecraft/webmagic/Request.java | 2 ++ .../main/java/us/codecraft/webmagic/Site.java | 23 ++++++++++++++++++- .../downloader/HttpClientDownloader.java | 17 +++++++++++++- .../us/codecraft/webmagic/selector/Html.java | 11 +++++++-- .../webmagic/scheduler/RedisScheduler.java | 8 ++++--- 5 files changed, 54 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 694d32b2..142a20c7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -17,6 +17,8 @@ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; + public 
static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; + private String url; /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 443f2bba..6a351786 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -30,6 +30,8 @@ public class Site { private int retryTimes = 0; + private int cycleRetryTimes = 0; + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; @@ -200,7 +202,7 @@ public class Site { } /** - * Get retry times when download fail, 0 by default.
+ * Get the number of immediate retries when a download fails, 0 by default.
* * @return retry times when download fail */ @@ -218,6 +220,25 @@ public class Site { return this; } + /** + * When cycleRetryTimes is more than 0, a request whose download failed is added back to the scheduler and downloaded again.
+ * + * @return cycle retry times when a download fails */ + public int getCycleRetryTimes() { + return cycleRetryTimes; + } + + /** + * Set cycle retry times for failed downloads, 0 by default. Only works with RedisScheduler.
+ * + * @return this + */ + public Site setCycleRetryTimes(int cycleRetryTimes) { + this.cycleRetryTimes = cycleRetryTimes; + return this; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 7a063298..82a4a9a7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -52,7 +52,7 @@ public class HttpClientDownloader implements Downloader { * @param url * @return html */ - public Html download(String url,String charset) { + public Html download(String url, String charset) { Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); return (Html) page.getHtml(); } @@ -90,6 +90,21 @@ public class HttpClientDownloader implements Downloader { if (tried > retryTimes) { logger.warn("download page " + request.getUrl() + " error", e); + if (site.getCycleRetryTimes() > 0) { + Page page = new Page(); + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes >= site.getCycleRetryTimes()) { + return null; + } + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } + return page; + } return null; } logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 17988249..b9b7f02b 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.selector; +import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import us.codecraft.webmagic.utils.EnvironmentUtil; @@ -15,6 +16,8 @@ import java.util.List; */ public class Html extends PlainText { + private Logger logger = Logger.getLogger(getClass()); + /** * Store parsed document for better performance when only one text exist. */ @@ -26,7 +29,11 @@ public class Html extends PlainText { public Html(String text) { super(text); - this.document = Jsoup.parse(text); + try { + this.document = Jsoup.parse(text); + } catch (Exception e) { + logger.warn("parse document error ", e); + } } public Html(Document document) { @@ -108,7 +115,7 @@ public class Html extends PlainText { } public String getText() { - if (strings!=null&&strings.size()>0){ + if (strings != null && strings.size() > 0) { return strings.get(0); } return document.html(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index e1916279..cd906255 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler { public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); try { - //使用Set进行url去重 - if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { - //使用List保存队列 + // if cycleRetriedTimes is set, allow duplicated. 
+ Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES); + // use set to remove duplicate url + if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { + // use list to store queue jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl()); if (request.getExtras() != null) { From 891d845e5f87f0bb24607a13024128f856d0b62a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 10:00:13 +0800 Subject: [PATCH 24/39] fix test --- .../codecraft/model/ProcessorBenchmark.java | 890 ------------------ 1 file changed, 890 deletions(-) delete mode 100644 webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java diff --git a/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java deleted file mode 100644 index c3f2829b..00000000 --- a/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java +++ /dev/null @@ -1,890 +0,0 @@ -package us.codecraft.webmagic.model; - -import org.junit.Test; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.model.samples.OschinaBlog; -import us.codecraft.webmagic.selector.Html; -import us.codecraft.webmagic.selector.PlainText; - -/** - * @author code4crafter@gmail.com - */ -public class ProcessorBenchmark { - - @Test - public void test() { - ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class); - Page page = new Page(); - page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); - page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); - page.setHtml(new Html(html)); - long time = System.currentTimeMillis(); - for (int i = 0; i < 1000; i++) { - 
modelPageProcessor.process(page); - } - System.out.println(System.currentTimeMillis() - time); - time = System.currentTimeMillis(); - for (int i = 0; i < 1000; i++) { - modelPageProcessor.process(page); - } - System.out.println(System.currentTimeMillis() - time); - } - - private String html = "\n" + - "\n" + - "\n" + - "\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "\n" + - "\n" + - "
\n" + - "\t
\n" + - "\t\t
\n" + - " \t开源中国社区\n" + - "
\n" + - "
开源项目发现、使用和交流平台
\n" + - "\t\t
\n" + - " \t\n" + - "
\n" + - "
\n" + - "\t
\n" + - "\t
\n" + - "\t\t
\n" + - "\t\t当前访客身份:\n" + - "\t\t\t\t黄亿华 [ 退出 ]\n" + - "\t\t\t\t\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t你有0新留言\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t\t\t\n" + - "\t\t
\n" + - "\t\t
\n" + - " \t\t
\n" + - "\t\t\t\t\n" + - "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + - " \t\t\t\n" + - "\t\t\t\t\n" + - "\t\t\t\t
\n" + - " \t\t\t\t\n" + - " \t\t\t\t\n" + - " \t\t\t\t\n" + - "
软件
\n" + - " \n" + - "
\n" + - "\t\t\t\t\t\t\t\n" + - " \t\t
\n" + - "\t\t
\n" + - "\t\t
\n" + - "\t
\n" + - "\t
\t\n" + - "\n" + - "
\n" + - "
\n" + - "\t\t切换风格 \"黄亿华\"\n" + - " \n" + - " 黄亿华\n" + - "\t\t\n" + - "\t\t\t\n" + - " \t\t\t修改资料\n" + - "\t\t\t更换头像\n" + - " \t\t\n" + - " \n" + - "
\n" + - "
\n" + - " \t关注(43)\n" + - " \t粉丝(98)\n" + - " \t积分(173)\n" + - "
\n" + - "
\n" + - "
\n" + - "码农一枚
实用主义者
抵制重复造轮子,却造了不少轮子
http://codecraft.us
\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t.发表博文\n" + - "\t.空间管理\n" + - "
\n" + - " 管理» 博客分类\n" + - " \n" + - "
\n" + - "
\n" + - " 管理» 最新评论 \n" + - "
    \n" + - "\t\t
  • \n" + - "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@黄亿华:引用来自“searchjack”的评论 不是好的就会被认...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@searchjack:不是好的就会被认可, 干自己的, 到时候, 单干\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@searchjack:极好的工具,\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@黄亿华:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@静风流云:貌似,OSC也是类似处理的。\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t\t
  • \n" + - "\t\t@黄亿华:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" + - "\t\t查看»\n" + - "\t
  • \n" + - "\t
\n" + - "
\n" + - "
\n" + - "访客统计\n" + - "
    \n" + - "\t
  • 6 (查看最新访客»)
  • \n" + - "
  • 284
  • \n" + - "
  • 817
  • \n" + - "
  • 1888
  • \n" + - "
  • 16453
  • \n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "\t
\n" + - " \t\n" + - "\t
\n" + - "\t\n" + - " \t
\t\t\n" + - "
\n" + - "

Jsoup代码解读之八-防御XSS攻击

\n" + - "
\n" + - " \t\t \t\t \t\t\n" + - " \t\t\t编辑 | 删除\n" + - " \t\t\n" + - "\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" + - " \t\t已有1628次阅读 ,共3个评论\n" + - " \t\t\t\t\t,共 79 人收藏此文 \t
\n" + - "
\n" + - "\t \t
\n" + - "

目录:[ - ]

\n" + - " \n" + - " \t
\n" + - " \n" + - "\t \t

\n" + - "\n" + - "

防御XSS攻击的一般原理

\n" + - "

cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。

\n" + - "

我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<,>,",'是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。

\n" + - "

在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:

\n" + - "
    \n" + - "
  1. 将HTML解析为DOM树

    这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用</textarea>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。

  2. \n" + - "
  3. 过滤高风险标签/属性/属性值

    高风险标签是指<script>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如onclick='alert("xss!")'

  4. \n" + - "
  5. 重新将DOM树输出为HTML文本

    DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。

  6. \n" + - "
\n" + - "\n" + - "

Cleaner与Whitelist

\n" + - "

对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。

\n" + - "

Jsoup给出的答案是白名单。下面是Whitelist的部分代码。

\n" + - "
public class Whitelist {\n" +
-            "    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
-            "    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
-            "    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
-            "    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
-            "    private boolean preserveRelativeLinks; // option to preserve relative links\n" +
-            "}
\n" + - "

这里定义了标签名/属性名/属性值的白名单。

\n" + - "

Cleaner是过滤的执行者。不出所料,Cleaner内部定义了CleaningVisitor来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到Element destination里去。

\n" + - "
private final class CleaningVisitor implements NodeVisitor {\n" +
-            "    private int numDiscarded = 0;\n" +
-            "    private final Element root;\n" +
-            "    private Element destination; // current element to append nodes to\n" +
-            "\n" +
-            "    private CleaningVisitor(Element root, Element destination) {\n" +
-            "        this.root = root;\n" +
-            "        this.destination = destination;\n" +
-            "    }\n" +
-            "\n" +
-            "    public void head(Node source, int depth) {\n" +
-            "        if (source instanceof Element) {\n" +
-            "            Element sourceEl = (Element) source;\n" +
-            "\n" +
-            "            if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
-            "                ElementMeta meta = createSafeElement(sourceEl);\n" +
-            "                Element destChild = meta.el;\n" +
-            "                destination.appendChild(destChild);\n" +
-            "\n" +
-            "                numDiscarded += meta.numAttribsDiscarded;\n" +
-            "                destination = destChild;\n" +
-            "            } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
-            "                numDiscarded++;\n" +
-            "            }\n" +
-            "        } else if (source instanceof TextNode) {\n" +
-            "            TextNode sourceText = (TextNode) source;\n" +
-            "            TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
-            "            destination.appendChild(destText);\n" +
-            "        } else { // else, we don't care about comments, xml proc instructions, etc\n" +
-            "            numDiscarded++;\n" +
-            "        }\n" +
-            "    }\n" +
-            "\n" +
-            "    public void tail(Node source, int depth) {\n" +
-            "        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
-            "            destination = destination.parent(); // would have descended, so pop destination stack\n" +
-            "        }\n" +
-            "    }\n" +
-            "}
\n" + - "\n" + - "

结束语

\n" + - "

至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:

\n" + - "
    \n" + - "
  • 最好的代码抽象,是对现实概念的映射。

    这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。

  • \n" + - "
  • 不要过度抽象

    在Jsoup里,只用到了两个接口,一个是NodeVisitor,一个是Connection,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。

    另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。

  • \n" + - "
\n" + - "

最后继续贴上Jsoup解读系列的github地址:https://github.com/code4craft/jsoup-learning/

\n" + - " \t \t \n" + - " \t\n" + - "\t
\n" + - " \t关键字:\n" + - " \t \tJsoup\n" + - " \t \tXSS\n" + - " \t \tOO\n" + - " \t \t
\n" + - "\t \t \n" + - "
\t\t\n" + - "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + - "\t \t
\n" + - "\n" + - " \n" + - "\t
\n" + - "\n" + - "\t\n" + - "\t
\n" + - "\t\n" + - "\t\n" + - "\t\t分享到: \n" + - "\t\t\n" + - "\t\t\n" + - "\t\n" + - " 已有 0人顶\n" + - "\t\n" + - "\t
\n" + - "\t\t\n" + - "
\n" + - "
\n" + - "
\n" + - "

共有 3 条网友评论

\n" + - "\t\t\t
    \n" + - "\t\t\t\t\t\t
  • \n" + - "\t\n" + - "\t\n" + - "\t\n" + - "\t
    \n" + - "\t\t\"静风流云\"\t\t\t\n" + - "\t\n" + - "\t\t
    \n" + - "\t\t\t1楼:静风流云 发表于 2013-09-01 08:34 \t\t\t\n" + - " \t \t 删除\n" + - "\t\t\t\t\t\t\t\t\t 回复此评论\n" + - "\t\t\t\t\t
    \n" + - "\t\t
    貌似,OSC也是类似处理的。
    \n" + - "\t\t
    \n" + - "
    \n" + - "
  • \t\t\t\t\t
  • \n" + - "\t\n" + - "\t\n" + - "\t\n" + - "\t
    \n" + - "\t\t\"黄亿华\"\t\t\t\n" + - "\t\n" + - "\t\t
    \n" + - "\t\t\t2楼:黄亿华 发表于 2013-09-01 08:37 \t\t\t\n" + - " \t \t 删除\n" + - "\t\t\t\t\t\t\t\t
    \n" + - "\t\t

    引用来自“静风流云”的评论

    貌似,OSC也是类似处理的。

    OSC就是使用Jsoup做解析的,见这里:http://www.oschina.net/p/jsoup
    \n" + - "\t\t
    \n" + - "
    \n" + - "
  • \t\t\t\t\t
  • \n" + - "\t\n" + - "\t\n" + - "\t\n" + - "\t
    \n" + - "\t\t\"searchjack\"\t\t\t\n" + - "\t\n" + - "\t\t
    \n" + - "\t\t\t3楼:searchjack 发表于 2013-09-02 09:20 \t\t\t\n" + - " \t \t 删除\n" + - "\t\t\t\t\t\t\t\t\t 回复此评论\n" + - "\t\t\t\t\t
    \n" + - "\t\t
    极好的工具,
    \n" + - "\t\t
    \n" + - "
    \n" + - "
  • \t\t\t\t
\n" + - "
\n" + - "\t
\n" + - "\n" + - "\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "\t \n" + - "\t \n" + - "\t 文明上网,理性发言\n" + - "
\n" + - "\t回到页首 | 回到评论列表\n" + - "
\n" + - "
\n" + - "\t\n" + - "
\n" + - "\t关闭相关文章阅读\n" + - "\t\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t
\n" + - "\t
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + - "\t开源中国手机客户端:\n" + - "\tAndroid\n" + - "\tiPhone\n" + - "\tWP7\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - ""; -} From 1fc8e104ab8277a5bd002c63659dc05d9f6594d8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 10:32:13 +0800 Subject: [PATCH 25/39] add cycle retry --- .../src/main/java/us/codecraft/webmagic/Spider.java | 6 ++++++ .../us/codecraft/webmagic/samples/F58PageProcesser.java | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 723e8058..47cefd0b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -310,6 +310,12 @@ public class Spider implements Runnable, Task { sleep(site.getSleepTime()); return; } + //for cycle retry + if (page.getHtml()==null){ + addRequest(page); + sleep(site.getSleepTime()); + return; + } pageProcessor.process(page); addRequest(page); if (!page.getResultItems().isSkip()) { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 7124a8c5..3d27be8e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -4,6 +4,7 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -24,10 +25,10 @@ public class F58PageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. 
+ return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. } public static void main(String[] args) { - Spider.create(new F58PageProcesser()).run(); + Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); } } From 77ff25231612ade51491da731af6c546de1b38c2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 10:50:50 +0800 Subject: [PATCH 26/39] [maven-release-plugin] prepare release webmagic-0.3.0 --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 58792ff4..19eb36d7 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.0-SNAPSHOT + 0.3.0 4.0.0 pom @@ -36,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - HEAD + webmagic-0.3.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 388cd6e2..76526a88 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.0-SNAPSHOT + 0.3.0 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 4cad2b07..e58fa020 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.0-SNAPSHOT + 0.3.0 4.0.0 From e7bf425df4f875dce4de9479e0cfe8178e1b7c2e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 10:51:01 +0800 Subject: [PATCH 27/39] [maven-release-plugin] prepare for next development iteration --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 19eb36d7..8b3c9877 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.0 + 0.3.1-SNAPSHOT 
4.0.0 pom @@ -36,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-0.3.0 + HEAD diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 76526a88..eb4a7514 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.0 + 0.3.1-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index e58fa020..4cdf0010 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.0 + 0.3.1-SNAPSHOT 4.0.0 From a9fc06a916008e9763dec67d240e84d81e94185d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 11:04:36 +0800 Subject: [PATCH 28/39] release note --- release-note.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/release-note.md b/release-note.md index ee3a9624..001568be 100755 --- a/release-note.md +++ b/release-note.md @@ -1,5 +1,19 @@ Release Notes ---- +*2013-9-4* `version:0.3.0` + +* Change default XPath selector from HtmlCleaner to [Xsoup](https://github.com/code4craft/xsoup). + + [Xsoup](https://github.com/code4craft/xsoup) is an XPath selector based on Jsoup written by me. It has much better performance than HtmlCleaner. + + Time of processing a page is reduced from 7~9ms to 0.4ms. + + If Xsoup is not stable for your usage, just use `Spider.xsoupOff()` to turn it off and report an issue to me! + +* Add cycle retry times for Site. + + When cycle retry times is set, Spider will put a URL whose download failed back into the scheduler, and retry it after a cycle of the queue. + *2012-8-20* `version:0.2.1` ComboExtractor support for annotation. 
From e1b6b54097a6657cfe1c43bb99ba8b47518c455f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 11:07:28 +0800 Subject: [PATCH 29/39] update version for samples --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a620ae51..a42a719a 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.0-SNAPSHOT + 0.3.1-SNAPSHOT 4.0.0 From 692de76f869312dc22b479b81b7f0f8b809f1c2f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 15:27:51 +0800 Subject: [PATCH 30/39] fix issue #21 charset detect error --- .../java/us/codecraft/webmagic/utils/UrlUtils.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 7dae1f22..4e1140b4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; +import java.nio.charset.Charset; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -98,15 +99,17 @@ public class UrlUtils { return stringBuilder.toString(); } - private static final Pattern patternForCharset = Pattern.compile("charset=([^\\s;]*)"); + private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); public static String getCharset(String contentType) { Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { - return matcher.group(1); - } else { - return null; + String charset = matcher.group(1); + if (Charset.isSupported(charset)) { + return charset; + } } + return null; } } From ac4cd391707da1190744a3891af7c62424fd8d37 Mon Sep 17 00:00:00 2001 
From: "yihua.huang" Date: Wed, 4 Sep 2013 20:37:42 +0800 Subject: [PATCH 31/39] update version --- README.md | 4 ++-- zh_docs/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 752e662b..5624019f 100644 --- a/README.md +++ b/README.md @@ -22,12 +22,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.2.1 + 0.3.0 us.codecraft webmagic-extension - 0.2.1 + 0.3.0 ## Get Started: diff --git a/zh_docs/README.md b/zh_docs/README.md index ee8580c1..0ef0b4d4 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -34,12 +34,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.2.1 + 0.3.0 us.codecraft webmagic-extension - 0.2.1 + 0.3.0 From 2e8cf0a3dd27503423afe0bc8f3600bcf8ac832b Mon Sep 17 00:00:00 2001 From: Linker Lin Date: Thu, 5 Sep 2013 00:30:10 +0800 Subject: [PATCH 32/39] =?UTF-8?q?=E5=B0=86=E5=8D=95=E5=85=83=E6=B5=8B?= =?UTF-8?q?=E8=AF=95fork=E7=8B=AC=E7=AB=8B=E7=9A=84JVM=E6=9D=A5=E8=B7=91?= =?UTF-8?q?=E3=80=82=E9=81=BF=E5=85=8D=E5=B0=91=E6=95=B0=E6=83=85=E5=86=B5?= =?UTF-8?q?=E9=BB=98=E8=AE=A4maven=E5=BC=80=E7=9A=84JVM=E5=A0=86=E5=A4=AA?= =?UTF-8?q?=E5=B0=8F=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pom.xml b/pom.xml index 8b3c9877..4fdfeee1 100644 --- a/pom.xml +++ b/pom.xml @@ -108,6 +108,14 @@ + + org.apache.maven.plugins + maven-surefire-plugin + + pertest + -Xms1024m -Xmx1024m -Xss1m + + org.apache.maven.plugins maven-compiler-plugin From 4d023b3666cc2101e92540004e5630dd2aa01319 Mon Sep 17 00:00:00 2001 From: Linker Lin Date: Thu, 5 Sep 2013 00:30:52 +0800 Subject: [PATCH 33/39] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=89=94=E9=99=A4?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore 
index cd33b618..8e88e25d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ target/* *.iml out/ +.idea From ef4cf49feea5ce38b2dc34b7a0e0a54632c07faf Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 6 Sep 2013 21:17:36 +0800 Subject: [PATCH 34/39] add stop method to spider #24 --- .../java/us/codecraft/webmagic/Spider.java | 69 ++++++++++++------- .../us/codecraft/webmagic/SpiderTest.java | 28 ++++++++ 2 files changed, 71 insertions(+), 26 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 47cefd0b..40fb70db 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -79,22 +79,22 @@ public class Spider implements Runnable, Task { * create a spider with pageProcessor. * * @param pageProcessor + * @return new spider + * @see PageProcessor */ - public Spider(PageProcessor pageProcessor) { - this.pageProcessor = pageProcessor; - this.site = pageProcessor.getSite(); - this.startUrls = pageProcessor.getSite().getStartUrls(); + public static Spider create(PageProcessor pageProcessor) { + return new Spider(pageProcessor); } /** * create a spider with pageProcessor. 
* * @param pageProcessor - * @return new spider - * @see PageProcessor */ - public static Spider create(PageProcessor pageProcessor) { - return new Spider(pageProcessor); + public Spider(PageProcessor pageProcessor) { + this.pageProcessor = pageProcessor; + this.site = pageProcessor.getSite(); + this.startUrls = pageProcessor.getSite().getStartUrls(); } /** @@ -105,7 +105,7 @@ public class Spider implements Runnable, Task { * @return this */ public Spider startUrls(List startUrls) { - checkIfNotRunning(); + checkIfRunning(); this.startUrls = startUrls; return this; } @@ -139,11 +139,11 @@ public class Spider implements Runnable, Task { * * @param scheduler * @return this - * @since 0.2.1 * @see Scheduler + * @since 0.2.1 */ public Spider setScheduler(Scheduler scheduler) { - checkIfNotRunning(); + checkIfRunning(); this.scheduler = scheduler; return this; } @@ -153,8 +153,8 @@ public class Spider implements Runnable, Task { * * @param pipeline * @return this - * @deprecated * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) + * @deprecated */ public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); @@ -165,11 +165,11 @@ public class Spider implements Runnable, Task { * * @param pipeline * @return this - * @since 0.2.1 * @see Pipeline + * @since 0.2.1 */ public Spider addPipeline(Pipeline pipeline) { - checkIfNotRunning(); + checkIfRunning(); this.pipelines.add(pipeline); return this; } @@ -189,8 +189,8 @@ public class Spider implements Runnable, Task { * * @param downloader * @return this - * @deprecated * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) + * @deprecated */ public Spider downloader(Downloader downloader) { return setDownloader(downloader); @@ -198,12 +198,13 @@ public class Spider implements Runnable, Task { /** * set the downloader of spider - * @see Downloader + * * @param downloader * @return this + * @see Downloader */ public Spider setDownloader(Downloader downloader) { - checkIfNotRunning(); + 
checkIfRunning(); this.downloader = downloader; return this; } @@ -220,7 +221,8 @@ public class Spider implements Runnable, Task { @Override public void run() { - if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) { + if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) + && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) { throw new IllegalStateException("Spider is already running!"); } checkComponent(); @@ -228,18 +230,19 @@ public class Spider implements Runnable, Task { for (String startUrl : startUrls) { scheduler.push(new Request(startUrl), this); } + startUrls.clear(); } Request request = scheduler.poll(this); - //singel thread + //single thread if (executorService == null) { - while (request != null) { + while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { processRequest(request); request = scheduler.poll(this); } } else { //multi thread final AtomicInteger threadAlive = new AtomicInteger(0); - while (true) { + while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { if (request == null) { //when no request found but some thread is alive, sleep a while. 
try { @@ -311,7 +314,7 @@ public class Spider implements Runnable, Task { return; } //for cycle retry - if (page.getHtml()==null){ + if (page.getHtml() == null) { addRequest(page); sleep(site.getSleepTime()); return; @@ -342,8 +345,8 @@ public class Spider implements Runnable, Task { } } - protected void checkIfNotRunning() { - if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) { + protected void checkIfRunning() { + if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) { throw new IllegalStateException("Spider is already running!"); } } @@ -354,6 +357,19 @@ public class Spider implements Runnable, Task { thread.start(); } + public void start() { + runAsync(); + } + + public void stop() { + stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + } + + public void stopAndDestroy() { + stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + destroy(); + } + /** * start with more than one threads * @@ -361,7 +377,7 @@ public class Spider implements Runnable, Task { * @return this */ public Spider thread(int threadNum) { - checkIfNotRunning(); + checkIfRunning(); this.threadNum = threadNum; if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); @@ -377,9 +393,10 @@ public class Spider implements Runnable, Task { /** * switch off xsoup + * * @return */ - public static void xsoupOff(){ + public static void xsoupOff() { EnvironmentUtil.setUseXsoup(false); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java new file mode 100644 index 00000000..b3249ce2 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.processor.SimplePageProcessor; + +/** + * @author code4crafter@gmail.com + */ +public class 
SpiderTest { + + @Ignore("long time") + @Test + public void testStartAndStop() throws InterruptedException { + Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() { + @Override + public void process(ResultItems resultItems, Task task) { + System.out.println(1); + } + }); + spider.start(); + Thread.sleep(10000); + spider.stop(); +// spider.run(); + Thread.sleep(10000); + } +} From d2e0f0cd33a957af5eedb62485ad745abed40af7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 6 Sep 2013 21:35:23 +0800 Subject: [PATCH 35/39] #25 use URL api in UrlUtils.canonicalizeUrl() --- .../us/codecraft/webmagic/utils/UrlUtils.java | 52 +++++++------------ .../webmagic/utils/UrlUtilsTest.java | 7 ++- 2 files changed, 23 insertions(+), 36 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 4e1140b4..4e5f67fc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; +import java.net.MalformedURLException; +import java.net.URL; import java.nio.charset.Charset; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -18,47 +20,33 @@ public class UrlUtils { /** * canonicalizeUrl + * + * Borrowed from Jsoup. 
+ * + * @param url * @param refer * @return canonicalizeUrl */ public static String canonicalizeUrl(String url, String refer) { - if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { - return url; - } - if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) { - return url; - } - if (StringUtils.startsWith(url, "/")) { - String host = getHost(refer); - return host + url; - } else if (!StringUtils.startsWith(url, ".")) { - refer = reversePath(refer, 1); - return refer + "/" + url; - } else { - Matcher matcher = relativePathPattern.matcher(url); - if (matcher.find()) { - int reverseDepth = matcher.group(1).length(); - refer = reversePath(refer, reverseDepth); - String substring = StringUtils.substring(url, matcher.end()); - return refer + "/" + substring; - } else { - refer = reversePath(refer, 1); - return refer + "/" + url; + URL base; + try { + try { + base = new URL(refer); + } catch (MalformedURLException e) { + // refer is not a usable base, but url may be absolute on its own, so try that + URL abs = new URL(url); + return abs.toExternalForm(); } + // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired + if (url.startsWith("?")) + url = base.getPath() + url; + URL abs = new URL(base, url); + return abs.toExternalForm(); + } catch (MalformedURLException e) { + return ""; } } - public static String reversePath(String url, int depth) { - int i = StringUtils.lastOrdinalIndexOf(url, "/", depth); - if (i < 10) { - url = getHost(url); - } else { - url = StringUtils.substring(url, 0, i); - } - return url; - } - public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index d1cbc21e..abe6adcc 100644 ---
a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -19,13 +19,12 @@ public class UrlUtilsTest { fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); + fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/"); + Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); + Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); } @Test From c17a31a21d342ddc4349417557bc8b63aba0ba07 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 8 Sep 2013 21:09:49 +0800 Subject: [PATCH 36/39] fix null pointe exception #26 --- .../webmagic/selector/BaseElementSelector.java | 14 +++++++++++--- .../webmagic/model/PageModelExtractor.java | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index e313f243..7d9035f0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -2,22 +2,30 @@ package us.codecraft.webmagic.selector; import 
org.jsoup.Jsoup; +import java.util.ArrayList; import java.util.List; /** * @author code4crafter@gmail.com * @since 0.3.0 */ -public abstract class BaseElementSelector implements Selector,ElementSelector { +public abstract class BaseElementSelector implements Selector, ElementSelector { @Override public String select(String text) { - return select(Jsoup.parse(text)); + if (text != null) { + return select(Jsoup.parse(text)); + } + return null; } @Override public List selectList(String text) { - return selectList(Jsoup.parse(text)); + if (text != null) { + return selectList(Jsoup.parse(text)); + } else { + return new ArrayList(); + } } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 03cd3a3a..54d942c1 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -184,7 +184,7 @@ class PageModelExtractor { return null; } if (objectExtractor == null) { - return processSingle(page, null, false); + return processSingle(page, null, true); } else { if (objectExtractor.multi) { List os = new ArrayList(); From d7c7a78177f8b5271330e37d0a79ae5aac199d96 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 8 Sep 2013 22:19:02 +0800 Subject: [PATCH 37/39] complete test cases --- .../us/codecraft/webmagic/MockDownloader.java | 1130 +++++++++++++++++ .../webmagic/MockPageModelPipeline.java | 14 + .../us/codecraft/webmagic/MockPipeline.java | 13 + .../codecraft/webmagic/model/GithubRepo.java | 87 ++ .../processor/GithubRepoProcessor.java | 35 + .../codecraft/webmagic/main/QuickStarter.java | 2 - .../webmagic/model/samples/GithubRepo.java | 2 - .../webmagic/model/samples/IteyeBlog.java | 2 +- .../webmagic/model/samples/Kr36NewsModel.java | 2 - .../webmagic/model/samples/News163.java | 2 - 
.../webmagic/model/samples/OschinaAnswer.java | 2 - .../webmagic/model/samples/OschinaBlog.java | 2 - .../samples/DiandianBlogProcessor.java | 2 - .../webmagic/samples/DiaoyuwengProcessor.java | 46 - .../webmagic/samples/F58PageProcesser.java | 34 - .../webmagic/samples/HuxiuProcessor.java | 7 +- .../samples/InfoQMiniBookProcessor.java | 2 - .../webmagic/samples/IteyeBlogProcessor.java | 5 +- .../webmagic/samples/KaichibaProcessor.java | 32 - .../webmagic/samples/MeicanProcessor.java | 38 - .../webmagic/samples/NjuBBSProcessor.java | 3 +- .../samples/OschinaBlogPageProcesser.java | 2 - .../samples/OschinaPageProcesser.java | 2 - .../webmagic/samples/QzoneBlogProcessor.java | 2 - .../webmagic/samples/SinaBlogProcesser.java | 2 - .../webmagic/samples/TianyaPageProcesser.java | 2 - .../src/main/resources/combine.sh | 8 - .../src/main/resources/ftl/wordpress.ftl | 22 - .../src/main/resources/wp-bottom.xml | 2 - .../src/main/resources/wp-head.xml | 35 - .../processor/DiaoyuwengProcessorTest.java | 28 - 31 files changed, 1284 insertions(+), 283 deletions(-) create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java delete mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java delete mode 100644 
webmagic-samples/src/main/resources/combine.sh delete mode 100644 webmagic-samples/src/main/resources/ftl/wordpress.ftl delete mode 100644 webmagic-samples/src/main/resources/wp-bottom.xml delete mode 100644 webmagic-samples/src/main/resources/wp-head.xml delete mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java new file mode 100644 index 00000000..8114b040 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java @@ -0,0 +1,1130 @@ +package us.codecraft.webmagic; + +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * @author code4crafter@gmail.com + */ +public class MockDownloader implements Downloader{ + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " code4craft/webmagic\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + " This repository\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
This repository
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
All repositories
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "
    \n" + + "\n" + + "
  • \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + " 23\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " Unwatch\n" + + " \n" + + " \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + " Notification status\n" + + " \n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Not watching

    \n" + + " You only receive notifications for discussions in which you participate or are @mentioned.\n" + + " \n" + + " \n" + + " Watch\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Watching

    \n" + + " You receive notifications for all discussions in this repository.\n" + + " \n" + + " \n" + + " Unwatch\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Ignoring

    \n" + + " You do not receive any notifications for discussions in this repository.\n" + + " \n" + + " \n" + + " Stop ignoring\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "
  • \n" + + "\n" + + "
  • \n" + + " \n" + + "
    \n" + + " \n" + + " Unstar\n" + + " \n" + + " \n" + + " Star\n" + + " \n" + + " 78\n" + + "
    \n" + + "\n" + + "
  • \n" + + "\n" + + "\n" + + "
  • \n" + + " \n" + + " Fork\n" + + " \n" + + " 65\n" + + "
  • \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "

\n" + + " public\n" + + " \n" + + " \n" + + " code4craft/webmagic\n" + + "\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + "\n" + + "

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

HTTPS clone URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

SSH clone URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

Subversion checkout URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "

You can clone with\n" + + " HTTPS,\n" + + " SSH,\n" + + " Subversion,\n" + + " and other methods.\n" + + "

\n" + + "\n" + + " \n" + + " \n" + + " Clone in Desktop\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " Download ZIP\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "

A scalable web crawler framework.

\n" + + "
\n" + + "\n" + + "\n" + + " Edit\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "\n" + + " \n" + + " or cancel\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + " \n" + + " \n" + + " Java\n" + + " 100.0%\n" + + " \n" + + "
  2. \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + " Java\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " branch:\n" + + " master\n" + + " \n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + " Switch branches/tags\n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
    \n" + + "
  • \n" + + " Branches\n" + + "
  • \n" + + "
  • \n" + + " Tags\n" + + "
  • \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " en-webmagic\n" + + "
\n" + + "
\n" + + " \n" + + " gh-pages\n" + + "
\n" + + "
\n" + + " \n" + + " master\n" + + "
\n" + + "
\n" + + " \n" + + " xsoup\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "

Create branch:

\n" + + " from ‘master’\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " webmagic-parent-0.2.1\n" + + "
\n" + + "
\n" + + " \n" + + " webmagic-0.3.0\n" + + "
\n" + + "
\n" + + " \n" + + " version-0.2.0\n" + + "
\n" + + "
\n" + + " \n" + + " version-0.1.0\n" + + "
\n" + + "
\n" + + "\n" + + "
Nothing to show
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "Show File Finder\n" + + "
\n" + + " \n" + + " \n" + + "\n" + + "
\n" + + "

\n" + + " Fetching latest commit…\n" + + "

\n" + + "
\n" + + "

\"Octocat-spinner-32-eaf2f5\"

\n" + + "

Cannot retrieve the latest commit at this time

\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " en_docs\n" + + " \n" + + " update readme\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-core\n" + + " \n" + + " fix null pointe exception #26\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-extension\n" + + " \n" + + " fix null pointe exception #26\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-lucene\n" + + " \n" + + " update pom\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-samples\n" + + " \n" + + " update version for samples\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-saxon\n" + + " \n" + + " xsoup test\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-selenium\n" + + " \n" + + " update pom\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " zh_docs\n" + + " \n" + + " update version\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " .gitignore\n" + + " \n" + + " 增加剔除文件\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " .travis.yml\n" + + " \n" + + " add jdk\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " README.md\n" + + " \n" + + " update version\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " pom.xml\n" + + " \n" + + " 将单元测试fork独立的JVM来跑。避免少数情况默认maven开的JVM堆太小。\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " release-note.md\n" + + " \n" + + " release note\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic manual.md\n" + + " \n" + + " readme\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " README.md

\n" + + "webmagic

\n" + + "\n" + + "

Readme in Chinese

\n" + + "\n" + + "

\"Build

\n" + + "\n" + + "
\n" + + "

A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.

\n" + + "
\n" + + "\n" + + "

\n" + + "Features:

\n" + + "\n" + + "
    \n" + + "
  • Simple core with high flexibility.
  • \n" + + "
  • Simple API for html extracting.
  • \n" + + "
  • Annotation with POJO to customize a crawler, no configuration.
  • \n" + + "
  • Multi-thread and Distribution support.
  • \n" + + "
  • Easy to be integrated.
  • \n" + + "

\n" + + "Install:

\n" + + "\n" + + "

Add dependencies to your pom.xml:

\n" + + "\n" + + "
    <dependency>\n" +
+            "        <groupId>us.codecraft</groupId>\n" +
+            "        <artifactId>webmagic-core</artifactId>\n" +
+            "        <version>0.3.0</version>\n" +
+            "    </dependency>\n" +
+            "    <dependency>\n" +
+            "        <groupId>us.codecraft</groupId>\n" +
+            "        <artifactId>webmagic-extension</artifactId>\n" +
+            "        <version>0.3.0</version>\n" +
+            "    </dependency>\n" +
+            "
\n" + + "\n" + + "

\n" + + "Get Started:

\n" + + "\n" + + "

\n" + + "First crawler:

\n" + + "\n" + + "

Write a class implements PageProcessor:

\n" + + "\n" + + "
    public class OschinaBlogPageProcesser implements PageProcessor {\n" +
+            "\n" +
+            "        private Site site = Site.me().setDomain(\"my.oschina.net\")\n" +
+            "           .addStartUrl(\"http://my.oschina.net/flashsword/blog\");\n" +
+            "\n" +
+            "        @Override\n" +
+            "        public void process(Page page) {\n" +
+            "            List<String> links = page.getHtml().links().regex(\"http://my\\\\.oschina\\\\.net/flashsword/blog/\\\\d+\").all();\n" +
+            "            page.addTargetRequests(links);\n" +
+            "            page.putField(\"title\", page.getHtml().xpath(\"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1\").toString());\n" +
+            "            page.putField(\"content\", page.getHtml().$(\"div.content\").toString());\n" +
+            "            page.putField(\"tags\",page.getHtml().xpath(\"//div[@class='BlogTags']/a/text()\").all());\n" +
+            "        }\n" +
+            "\n" +
+            "        @Override\n" +
+            "        public Site getSite() {\n" +
+            "            return site;\n" +
+            "\n" +
+            "        }\n" +
+            "\n" +
+            "        public static void main(String[] args) {\n" +
+            "            Spider.create(new OschinaBlogPageProcesser())\n" +
+            "                 .pipeline(new ConsolePipeline()).run();\n" +
+            "        }\n" +
+            "    }\n" +
+            "
\n" + + "\n" + + "
    \n" + + "
  • \n" + + "

    page.addTargetRequests(links)

    \n" + + "\n" + + "

    Add urls for crawling.

    \n" + + "
  • \n" + + "

You can also use annotation way:

\n" + + "\n" + + "
    @TargetUrl(\"http://my.oschina.net/flashsword/blog/\\\\d+\")\n" +
+            "    public class OschinaBlog {\n" +
+            "\n" +
+            "        @ExtractBy(\"//title\")\n" +
+            "        private String title;\n" +
+            "\n" +
+            "        @ExtractBy(value = \"div.BlogContent\",type = ExtractBy.Type.Css)\n" +
+            "        private String content;\n" +
+            "\n" +
+            "        @ExtractBy(value = \"//div[@class='BlogTags']/a/text()\", multi = true)\n" +
+            "        private List<String> tags;\n" +
+            "\n" +
+            "        public static void main(String[] args) {\n" +
+            "            OOSpider.create(\n" +
+            "                Site.me().addStartUrl(\"http://my.oschina.net/flashsword/blog\"),\n" +
+            "                new ConsolePageModelPipeline(), OschinaBlog.class).run();\n" +
+            "        }\n" +
+            "    }\n" +
+            "
\n" + + "\n" + + "

\n" + + "Docs and samples:

\n" + + "\n" + + "

The architecture of webmagic (refered to Scrapy)

\n" + + "\n" + + "

\"image\"

\n" + + "\n" + + "

Javadocs: http://code4craft.github.io/webmagic/docs/en/

\n" + + "\n" + + "

There are some samples in webmagic-samples package.

\n" + + "\n" + + "

\n" + + "Lisence:

\n" + + "\n" + + "

Lisenced under Apache 2.0 lisence

\n" + + "\n" + + "

\n" + + "Thanks:

\n" + + "\n" + + "

To write webmagic, I refered to the projects below :

\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " Something went wrong with that request. Please try again.\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n"; + @Override + public Page download(Request request, Task task) { + Page page = new Page(); + page.setHtml(new Html(html)); + page.setRequest(new Request("https://github.com/code4craft/webmagic")); + page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); + return page; + } + + @Override + public void setThread(int threadNum) { + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java new file mode 100644 index 00000000..ea7601b0 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic; + +import junit.framework.Assert; +import us.codecraft.webmagic.model.PageModelPipeline; + +/** + * @author code4crafter@gmail.com + */ +public class MockPageModelPipeline implements PageModelPipeline{ + @Override + public void process(Object o, Task task) { + Assert.assertNotNull(o); + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java new file mode 100644 index 00000000..7572c158 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic; + +import us.codecraft.webmagic.pipeline.Pipeline; + +/** + * @author code4crafter@gmail.com + */ +public class MockPipeline implements Pipeline{ + @Override + public void process(ResultItems resultItems, Task task) { + + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java new file mode 100644 index 00000000..5b6319a0 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java @@ -0,0 +1,87 @@ +package 
us.codecraft.webmagic.model; + +import junit.framework.Assert; +import org.junit.Test; +import us.codecraft.webmagic.MockDownloader; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractByUrl; +import us.codecraft.webmagic.model.annotation.HelpUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +@TargetUrl("https://github.com/\\w+/\\w+") +@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) +public class GithubRepo implements HasKey { + + @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) + private String name; + + @ExtractByUrl("https://github\\.com/(\\w+)/.*") + private String author; + + @ExtractBy("//div[@id='readme']") + private String readme; + + @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true) + private List language; + + @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") + private String star; + + @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()") + private String fork; + + @ExtractByUrl + private String url; + + @Test + public void test() { + OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0) + , new PageModelPipeline<GithubRepo>() { + @Override + public void process(GithubRepo o, Task task) { + Assert.assertEquals("78",o.getStar().trim()); + Assert.assertEquals("65",o.getFork().trim()); + } + }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + } + + @Override + public String key() { + return author + ":" + name; + } + + public String getName() { + return name; + } + + public String getReadme() { + return readme; + } + + public String getAuthor() { + return author; + } + + public List getLanguage() { + return language; + } + + public String getUrl() { + return url; + } + + public String getStar() { + return star; + } + + public String getFork() { + return fork; + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java new file mode 100644 index 00000000..02b2ac1b --- /dev/null +++
b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java @@ -0,0 +1,35 @@ +package us.codecraft.webmagic.processor; + +import junit.framework.Assert; +import org.junit.Test; +import us.codecraft.webmagic.*; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.pipeline.Pipeline; + +/** + * @author code4crafter@gmail.com + */ +public class GithubRepoProcessor implements PageProcessor { + @Override + public void process(Page page) { + page.putField("star",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString()); + page.putField("fork",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()").toString()); + } + + @Override + public Site getSite() { + return Site.me().addStartUrl("https://github.com/code4craft/webmagic"); + } + + @Test + public void test() { + OOSpider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() { + @Override + public void process(ResultItems resultItems, Task task) { + Assert.assertEquals("78",((String)resultItems.get("star")).trim()); + Assert.assertEquals("65",((String)resultItems.get("fork")).trim()); + } + }).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java index 69adabb7..074dd0f4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -14,8 +14,6 @@ import java.util.Scanner; /** * @author code4crafter@gmail.com
- * Date: 13-8-7
- * Time: 下午9:24
*/ public class QuickStarter { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java index 79a20fff..e8998eca 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java @@ -14,8 +14,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-10
- * Time: 下午6:37
*/ @TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"}) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java index ae945252..7e3dc516 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run(); } public String getTitle() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index bba8d829..de3fdf5d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -10,8 +10,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
- * Date: 13-8-11
- * Time: 下午9:29
*/ @TargetUrl("http://www.36kr.com/p/\\d+.html") @HelpUrl("http://www.36kr.com/#/page/\\d+") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 946e7377..e9dfb263 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -16,8 +16,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-4
- * Time: 下午8:17
*/ @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") public class News163 implements MultiPageModel { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java index e878633b..112f86a6 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -9,8 +9,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
- * Date: 13-8-3
- * Time: 下午8:25
*/ @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") @HelpUrl("http://www.oschina.net/question/*") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 96de9774..7819b446 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -11,8 +11,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-2
- * Time: 上午7:52
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog implements HasKey{ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index a1189e45..25baa1fb 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class DiandianBlogProcessor implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java deleted file mode 100644 index 3ceba0af..00000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ /dev/null @@ -1,46 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.selector.PlainText; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 - */ -public class DiaoyuwengProcessor implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - List requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all(); - page.addTargetRequests(requests); - requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all(); - page.addTargetRequests(requests); - if (page.getUrl().toString().contains("thread")){ - page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); - page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()")); - page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); - page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); - } - } - - @Override - public Site getSite() { - if (site==null){ - site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). 
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500); - } - return site; - } - - public static void main(String[] args) { - Spider.create(new DiaoyuwengProcessor()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java deleted file mode 100644 index 3d27be8e..00000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 - */ -public class F58PageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().links().regex(".*/yewu/.*").all(); - page.addTargetRequests(strings); - page.putField("title",page.getHtml().regex("(.*)")); - page.putField("body",page.getHtml().xpath("//dd")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. - } - - public static void main(String[] args) { - Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 136eeb83..7cb7be2c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -9,8 +9,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class HuxiuProcessor implements PageProcessor { @Override @@ -18,13 +16,12 @@ public class HuxiuProcessor implements PageProcessor { List requests = page.getHtml().links().regex(".*article.*").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); - page.putField("content",page.getHtml().smartContent()); + page.putField("content",page.getHtml().xpath("//div[@id='neirong_box']/tidyText()")); } @Override public Site getSite() { - return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"); } public static void main(String[] args) { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 38de3bc0..3ef39574 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -10,8 +10,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class InfoQMiniBookProcessor implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index f80f895a..26b85e87 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 上午7:31
*/ public class IteyeBlogProcessor implements PageProcessor { @@ -24,8 +22,7 @@ public class IteyeBlogProcessor implements PageProcessor { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). - setSleepTime(100).setRetryTimes(3); + site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"); } return site; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java deleted file mode 100644 index 0ab6c644..00000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ /dev/null @@ -1,32 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- * Date: 13-5-20 - * Time: 下午5:31 - */ -public class KaichibaProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; - page.addTargetRequest("http://kaichiba.com/shop/" + i); - page.putField("title",page.getHtml().xpath("//Title")); - page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*?", "")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - - public static void main(String[] args) { - Spider.create(new KaichibaProcessor()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java deleted file mode 100644 index bfa347d2..00000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ /dev/null @@ -1,38 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- * Date: 13-5-20 - * Time: 下午5:31 - */ -public class MeicanProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all(); - if (requests.size() > 2) { - requests = requests.subList(0, 2); - } - page.addTargetRequests(requests); - page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); - page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); - page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - - public static void main(String[] args) { - Spider.create(new MeicanProcessor()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index 2337da59..16dcb0cb 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -22,7 +22,6 @@ public class NjuBBSProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). 
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index e447003b..ded1a5f4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -9,8 +9,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class OschinaBlogPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index 522eb2c6..b75cc832 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class OschinaPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index 49418b60..d9cee2be 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class QzoneBlogProcessor implements PageProcessor { @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index b4c5bc88..dcb6eff9 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class SinaBlogProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index ecc55b42..d14b4420 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class TianyaPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/resources/combine.sh b/webmagic-samples/src/main/resources/combine.sh deleted file mode 100644 index 0e7bd0c8..00000000 --- a/webmagic-samples/src/main/resources/combine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -touch wordpress.xml -cat wp-head.xml >> wordpress.xml -for f in `ls`; - do - cat ${f} >> ../wordpress.xml - done; -cat wp-bottom.xml >> wordpress.xml \ No newline at end of file diff --git a/webmagic-samples/src/main/resources/ftl/wordpress.ftl b/webmagic-samples/src/main/resources/ftl/wordpress.ftl deleted file mode 100644 index f2feeb16..00000000 --- a/webmagic-samples/src/main/resources/ftl/wordpress.ftl +++ /dev/null @@ -1,22 +0,0 @@ - - ${title} - http://127.0.0.1/wordpress/?p=${id} - ${date} - admin - http://127.0.0.1/wordpress/?p=${id} - - - - ${id} - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - diff --git a/webmagic-samples/src/main/resources/wp-bottom.xml b/webmagic-samples/src/main/resources/wp-bottom.xml deleted file mode 100644 index f651c3bb..00000000 --- a/webmagic-samples/src/main/resources/wp-bottom.xml +++ /dev/null @@ -1,2 +0,0 @@ - - \ No newline at end of file diff --git a/webmagic-samples/src/main/resources/wp-head.xml b/webmagic-samples/src/main/resources/wp-head.xml deleted file mode 100644 index 8330ba1b..00000000 --- a/webmagic-samples/src/main/resources/wp-head.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - 1.1 - http://127.0.0.1/wordpress - http://127.0.0.1/wordpress - - 1adminflashsword20@163.com - - - http://wordpress.org/?v=3.3.1 diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java deleted file mode 100644 index 0371eb23..00000000 --- 
a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.processor; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.JsonFilePipeline; -import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; - -import java.io.IOException; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午8:02 - */ -public class DiaoyuwengProcessorTest { - - @Ignore - @Test - public void test() throws IOException { - DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); - JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); - Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). - run(); - } -} From bfaaa042b90c19baedc72f01f61849705914de29 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 8 Sep 2013 22:24:48 +0800 Subject: [PATCH 38/39] [maven-release-plugin] prepare release webmagic-parent-0.3.1 --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 4fdfeee1..c889ec7a 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.1-SNAPSHOT + 0.3.1 4.0.0 pom @@ -36,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - HEAD + webmagic-parent-0.3.1 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index eb4a7514..f8e35d8c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.1-SNAPSHOT + 0.3.1 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 4cdf0010..098bc94f 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.1-SNAPSHOT + 0.3.1 4.0.0 From fb693a4ac41667ba70f2d7c11c73b364fa569e67 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 8 Sep 2013 22:25:07 +0800 Subject: [PATCH 39/39] [maven-release-plugin] prepare for next development iteration --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 
c889ec7a..e2685a8a 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.1 + 0.3.2-SNAPSHOT 4.0.0 pom @@ -36,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-parent-0.3.1 + HEAD diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index f8e35d8c..2506d71a 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.1 + 0.3.2-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 098bc94f..7befae77 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.1 + 0.3.2-SNAPSHOT 4.0.0