From d3e527fd6bc7b6281eb479152911b4a17b33ee10 Mon Sep 17 00:00:00 2001 From: "yihua.huang" <code4crafter@gmail.com> Date: Fri, 26 Jul 2013 11:52:23 +0800 Subject: [PATCH] try invite selenium --- pom.xml | 114 +++++++++++++++++- webmagic-core/pom.xml | 73 +---------- webmagic-plugin/pom.xml | 62 +--------- webmagic-samples/pom.xml | 97 ++------------- .../webmagic/samples/IteyeBlogProcessor.java | 5 +- webmagic-selenium/pom.xml | 37 ++++++ 6 files changed, 165 insertions(+), 223 deletions(-) create mode 100644 webmagic-selenium/pom.xml diff --git a/pom.xml b/pom.xml index 39f068c5..f0b3a7d7 100644 --- a/pom.xml +++ b/pom.xml @@ -3,18 +3,65 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <groupId>us.codecraft</groupId> - <version>0.0.1-SNAPSHOT</version> + <version>0.1.0</version> <modelVersion>4.0.0</modelVersion> <packaging>pom</packaging> <artifactId>webmagic</artifactId> <modules> - <module>./webmagic-core</module> - <module>./webmagic-plugin/</module> - <module>./webmagic-samples/</module> - </modules> + <module>webmagic-core</module> + <module>webmagic-plugin/</module> + <module>webmagic-samples/</module> + <module>webmagic-selenium</module> + </modules> - <build> + <dependencyManagement> + <dependencies> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.7</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpclient</artifactId> + <version>4.2.4</version> + </dependency> + <dependency> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + <version>1.2.17</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + <version>3.1</version> + </dependency> + <dependency> + <groupId>commons-collections</groupId> + <artifactId>commons-collections</artifactId> + <version>3.2.1</version> + </dependency> + <dependency> + <groupId>net.sourceforge.htmlcleaner</groupId> + <artifactId>htmlcleaner</artifactId> + <version>2.4</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-io</artifactId> + <version>1.3.2</version> + </dependency> + <dependency> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.7.2</version> + </dependency> + </dependencies> + </dependencyManagement> + + <build> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> @@ -25,6 +72,61 @@ <target>1.6</target> </configuration> </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <executions> + <execution> + <id>copy-dependencies</id> + <phase>package</phase> + <goals> + <goal>copy-dependencies</goal> + </goals> + <configuration> + <outputDirectory>${project.build.directory}/lib</outputDirectory> + <overWriteReleases>false</overWriteReleases> + <overWriteSnapshots>false</overWriteSnapshots> + <overWriteIfNewer>true</overWriteIfNewer> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-resources-plugin</artifactId> + <configuration> + <encoding>UTF-8</encoding> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-source-plugin</artifactId> + <executions> + <execution> + <id>attach-sources</id> + <goals> + <goal>jar</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + <executions> + <execution> + <id>attach-javadocs</id> + <goals> + <goal>jar</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-release-plugin</artifactId> + <version>2.0-beta-7</version> + </plugin> </plugins> </build> diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index b0de214c..60c37c02 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -2,8 +2,11 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <groupId>us.codecraft</groupId> - <version>0.1.0</version> + <parent> + <groupId>us.codecraft</groupId> + <artifactId>webmagic</artifactId> + <version>0.1.0</version> + </parent> <modelVersion>4.0.0</modelVersion> <artifactId>webmagic-core</artifactId> @@ -12,109 +15,43 @@ <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> - <version>4.2.4</version> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.7</version> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - <version>13.0.1</version> </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> - <version>3.1</version> </dependency> <dependency> <groupId>log4j</groupId> <artifactId>log4j</artifactId> - <version>1.2.17</version> </dependency> <dependency> <groupId>commons-collections</groupId> <artifactId>commons-collections</artifactId> - <version>3.2.1</version> </dependency> <dependency> <groupId>net.sourceforge.htmlcleaner</groupId> <artifactId>htmlcleaner</artifactId> - <version>2.4</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> - <version>1.7.2</version> </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-io</artifactId> - <version>1.3.2</version> </dependency> </dependencies> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <version>3.1</version> - <configuration> - <source>1.6</source> - <target>1.6</target> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-resources-plugin</artifactId> - <configuration> - <encoding>UTF-8</encoding> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-source-plugin</artifactId> - <executions> - <execution> - <id>attach-sources</id> - <goals> - <goal>jar</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-javadoc-plugin</artifactId> - <executions> - <execution> - <id>attach-javadocs</id> - <goals> - <goal>jar</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-release-plugin</artifactId> - <version>2.0-beta-7</version> - </plugin> - </plugins> - </build> - - </project> \ No newline at end of file diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 634f09d3..b75dc9e7 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -2,8 +2,11 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <groupId>us.codecraft</groupId> - <version>0.1.0</version> + <parent> + <groupId>us.codecraft</groupId> + <artifactId>webmagic</artifactId> + <version>0.1.0</version> + </parent> <modelVersion>4.0.0</modelVersion> <artifactId>webmagic-plugin</artifactId> @@ -12,13 +15,11 @@ <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> - <version>0.1.0</version> + <version>${project.version}</version> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.7</version> - <scope>test</scope> </dependency> <dependency> <groupId>org.freemarker</groupId> @@ -32,55 +33,4 @@ </dependency> </dependencies> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <version>3.1</version> - <configuration> - <source>1.6</source> - <target>1.6</target> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-resources-plugin</artifactId> - <configuration> - <encoding>UTF-8</encoding> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-source-plugin</artifactId> - <executions> - <execution> - <id>attach-sources</id> - <goals> - <goal>jar</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-javadoc-plugin</artifactId> - <executions> - <execution> - <id>attach-javadocs</id> - <goals> - <goal>jar</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-release-plugin</artifactId> - <version>2.0-beta-7</version> - </plugin> - </plugins> - </build> - - </project> \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index ac2092f6..8af7672d 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -2,9 +2,11 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - - <groupId>us.codecraft</groupId> - <version>0.1.0</version> + <parent> + <groupId>us.codecraft</groupId> + <artifactId>webmagic</artifactId> + <version>0.1.0</version> + </parent> <modelVersion>4.0.0</modelVersion> <artifactId>webmagic-samples</artifactId> @@ -13,102 +15,17 @@ <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> - <version>0.1.0</version> + <version>${project.version}</version> </dependency> <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-plugin</artifactId> - <version>0.1.0</version> + <version>${project.version}</version> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.7</version> - <scope>test</scope> </dependency> </dependencies> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <version>3.1</version> - <configuration> - <source>1.6</source> - <target>1.6</target> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-dependency-plugin</artifactId> - <executions> - <execution> - <id>copy-dependencies</id> - <phase>package</phase> - <goals> - <goal>copy-dependencies</goal> - </goals> - <configuration> - <outputDirectory>${project.build.directory}/lib</outputDirectory> - <overWriteReleases>false</overWriteReleases> - <overWriteSnapshots>false</overWriteSnapshots> - <overWriteIfNewer>true</overWriteIfNewer> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-resources-plugin</artifactId> - <configuration> - <encoding>UTF-8</encoding> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-source-plugin</artifactId> - <executions> - <execution> - <id>attach-sources</id> - <goals> - <goal>jar</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-javadoc-plugin</artifactId> - <executions> - <execution> - <id>attach-javadocs</id> - <goals> - <goal>jar</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <configuration> - <archive> - <manifest> - <addClasspath>true</addClasspath> - <classpathPrefix>./lib/</classpathPrefix> - <mainClass>us.codecraft.webmagic.samples.DianpingIndexProcessor</mainClass> - </manifest> - </archive> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-release-plugin</artifactId> - <version>2.0-beta-7</version> - </plugin> - </plugins> - </build> - - </project> \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index 188f3a1f..76f9cc30 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -26,13 +26,12 @@ public class IteyeBlogProcessor implements PageProcessor { public Site getSite() { if (site == null) { site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31") - .setSleepTime(100).setRetryTimes(3); + setSleepTime(100).setRetryTimes(3); } return site; } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline()).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run(); } } diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml new file mode 100644 index 00000000..209fbe8f --- /dev/null +++ b/webmagic-selenium/pom.xml @@ -0,0 +1,37 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + + <parent> + <groupId>us.codecraft</groupId> + <artifactId>webmagic</artifactId> + <version>0.1.0</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <artifactId>webmagic-selenium</artifactId> + + <dependencies> + <dependency> + <groupId>us.codecraft</groupId> + <artifactId>webmagic-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>us.codecraft</groupId> + <artifactId>webmagic-plugin</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </dependency> + <dependency> + <groupId>org.seleniumhq.selenium</groupId> + <artifactId>selenium-java</artifactId> + <version>2.33.0</version> + </dependency> + </dependencies> + + +</project> \ No newline at end of file