diff --git a/pom.xml b/pom.xml index 39f068c5..f0b3a7d7 100644 --- a/pom.xml +++ b/pom.xml @@ -3,18 +3,65 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.0.1-SNAPSHOT + 0.1.0 4.0.0 pom webmagic - ./webmagic-core - ./webmagic-plugin/ - ./webmagic-samples/ - + webmagic-core + webmagic-plugin/ + webmagic-samples/ + webmagic-selenium + - + + + + junit + junit + 4.7 + test + + + org.apache.httpcomponents + httpclient + 4.2.4 + + + log4j + log4j + 1.2.17 + + + org.apache.commons + commons-lang3 + 3.1 + + + commons-collections + commons-collections + 3.2.1 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.4 + + + org.apache.commons + commons-io + 1.3.2 + + + org.jsoup + jsoup + 1.7.2 + + + + + org.apache.maven.plugins @@ -25,6 +72,61 @@ 1.6 + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + false + false + true + + + + + + org.apache.maven.plugins + maven-resources-plugin + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.0-beta-7 + diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index b0de214c..60c37c02 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -2,8 +2,11 @@ - us.codecraft - 0.1.0 + + us.codecraft + webmagic + 0.1.0 + 4.0.0 webmagic-core @@ -12,109 +15,43 @@ org.apache.httpcomponents httpclient - 4.2.4 junit junit - 4.7 - test - - - - com.google.guava - guava - 13.0.1 org.apache.commons commons-lang3 - 3.1 log4j log4j - 1.2.17 commons-collections commons-collections - 3.2.1 net.sourceforge.htmlcleaner htmlcleaner - 2.4 org.jsoup jsoup - 1.7.2 org.apache.commons commons-io - 1.3.2 - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - 1.6 - 1.6 - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - \ No newline at end of file diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 634f09d3..b75dc9e7 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -2,8 +2,11 @@ - us.codecraft - 0.1.0 + + us.codecraft + webmagic + 0.1.0 + 4.0.0 webmagic-plugin @@ -12,13 +15,11 @@ us.codecraft webmagic-core - 0.1.0 + ${project.version} junit junit - 4.7 - test org.freemarker @@ -32,55 +33,4 @@ - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - 1.6 - 1.6 - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index ac2092f6..8af7672d 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -2,9 +2,11 @@ - - us.codecraft - 0.1.0 + + us.codecraft + webmagic + 0.1.0 + 4.0.0 webmagic-samples @@ -13,102 +15,17 @@ us.codecraft webmagic-core - 0.1.0 + ${project.version} us.codecraft webmagic-plugin - 0.1.0 + ${project.version} junit junit - 4.7 - test - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.1 - - 1.6 - 1.6 - - - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - package - - copy-dependencies - - - ${project.build.directory}/lib - false - false - true - - - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - attach-javadocs - - jar - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - - true - ./lib/ - us.codecraft.webmagic.samples.DianpingIndexProcessor - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - - \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index 188f3a1f..76f9cc30 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -26,13 +26,12 @@ public class IteyeBlogProcessor implements PageProcessor { public Site getSite() { if (site == null) { site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31") - .setSleepTime(100).setRetryTimes(3); + setSleepTime(100).setRetryTimes(3); } return site; } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline()).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run(); } } diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml new file mode 100644 index 00000000..209fbe8f --- /dev/null +++ b/webmagic-selenium/pom.xml @@ -0,0 +1,37 @@ + + + + + us.codecraft + webmagic + 0.1.0 + + 4.0.0 + webmagic-selenium + + + + us.codecraft + webmagic-core + ${project.version} + + + us.codecraft + webmagic-plugin + ${project.version} + + + junit + junit + + + org.seleniumhq.selenium + selenium-java + 2.33.0 + + + + + \ No newline at end of file