Merge branch 'release/1.0.0'

pull/670/merge WebMagic-1.0.0
Joe Zhou 7 months ago
commit 9d75cce16d

82
.gitignore vendored

@ -1,9 +1,77 @@
target
*.iml
out/
.idea
.classpath
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
# https://github.com/takari/maven-wrapper#usage-without-binary-jar
.mvn/wrapper/maven-wrapper.jar
# Eclipse m2e generated files
# Eclipse Core
.project
.settings/
# JDT-specific (Eclipse Java Development Tools)
.classpath
.metadata
bin/
.myeclipse
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
.recommenders
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# PyDev specific (Python IDE for Eclipse)
*.pydevproject
# CDT-specific (C/C++ Development Tooling)
.cproject
# CDT- autotools
.autotools
# Java annotation processor (APT)
.factorypath
# PDT-specific (PHP Development Tools)
.buildpath
# sbteclipse plugin
.target
# Tern plugin
.tern-project
# TeXlipse plugin
.texlipse
# STS (Spring Tool Suite)
.springBeans
# Code Recommenders
.recommenders/
# Annotation Processing
.apt_generated/
.apt_generated_test/
# Scala IDE specific (Scala & Java development for Eclipse)
.cache-main
.scala_dependencies
.worksheet
# Uncomment this line if you wish to ignore the project description file.
# Typically, this file would be tracked if it contains build/dependency configurations:
#.project

@ -1,14 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.10.3</version>
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.oxerr</groupId>
<artifactId>oxerr-parent</artifactId>
<version>2.2.1</version>
</parent>
<groupId>us.codecraft</groupId>
<version>1.0.0</version>
<packaging>pom</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<assertj.version>3.23.1</assertj.version>
<commons-cli.version>1.5.0</commons-cli.version>
<commons-collections4.version>4.4</commons-collections4.version>
@ -23,20 +33,21 @@
<jedis.version>3.7.1</jedis.version>
<jruby.version>9.3.9.0</jruby.version>
<json-path.version>2.9.0</json-path.version>
<junit.version>4.13.2</junit.version>
<junit.version>5.10.2</junit.version>
<junit.platform.version>1.10.2</junit.platform.version>
<jython.version>2.7.3</jython.version>
<log4j.version>1.2.17</log4j.version>
<log4j2.version>2.23.1</log4j2.version>
<mockito-all.version>2.0.2-beta</mockito-all.version>
<moco.version>1.3.0</moco.version>
<phantomjsdriver.version>1.2.0</phantomjsdriver.version>
<saxon-he.version>11.4</saxon-he.version>
<selenium-java.version>3.141.59</selenium-java.version>
<saxon-he.version>12.4</saxon-he.version>
<selenium-java.version>4.14.1</selenium-java.version>
<slf4j.version>2.0.4</slf4j.version>
<spring-version>4.0.0.RELEASE</spring-version>
<xsoup.version>0.3.5</xsoup.version>
</properties>
<artifactId>webmagic-parent</artifactId>
<name>webmagic-parent</name>
<artifactId>webmagic</artifactId>
<name>webmagic</name>
<description>
A crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content
extraction and persistence. It can simplify the development of a specific crawler.
@ -77,14 +88,41 @@
<module>webmagic-coverage</module>
</modules>
<dependencies>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-launcher</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-runner</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
@ -101,6 +139,16 @@
<artifactId>httpcore</artifactId>
<version>${httpcore.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j2.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<version>${log4j2.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
@ -112,13 +160,28 @@
<version>${json-path.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>${junit.version}</version>
</dependency>
<dependency>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
<version>${junit.version}</version>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-launcher</artifactId>
<version>${junit.platform.version}</version>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-runner</artifactId>
<version>${junit.platform.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
@ -143,11 +206,6 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
@ -219,86 +277,10 @@
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>3.1.0</version>
<executions>
<execution>
<id>enforce-maven</id>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireMavenVersion>
<version>3.5.0</version>
</requireMavenVersion>
</rules>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
</plugin>
<!--<plugin>-->
<!--<groupId>org.apache.maven.plugins</groupId>-->
<!--<artifactId>maven-dependency-plugin</artifactId>-->
<!--<version>2.8</version>-->
<!--<executions>-->
<!--<execution>-->
<!--<id>copy-dependencies</id>-->
<!--<phase>package</phase>-->
<!--<goals>-->
<!--<goal>copy-dependencies</goal>-->
<!--</goals>-->
<!--<configuration>-->
<!--<outputDirectory>${project.build.directory}/lib</outputDirectory>-->
<!--<overWriteReleases>false</overWriteReleases>-->
<!--<overWriteSnapshots>false</overWriteSnapshots>-->
<!--<overWriteIfNewer>true</overWriteIfNewer>-->
<!--</configuration>-->
<!--</execution>-->
<!--</executions>-->
<!--</plugin>-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<excludes>
<exclude>log4j.xml</exclude>
</excludes>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.4.1</version>
<configuration>
<encoding>UTF-8</encoding>
<doctitle>WebMagic ${project.version}</doctitle>
<locale>en_US</locale>
@ -322,11 +304,6 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>3.0.0-M6</version>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
@ -355,189 +332,6 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
<version>3.2.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.10.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<version>3.0.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.3.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId>
<version>3.3.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<version>3.19.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.3.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>4.0.0-M3</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0-M7</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-report-plugin</artifactId>
<version>3.0.0-M7</version>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>taglist-maven-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.8</version>
</plugin>
<plugin>
<groupId>com.amashchenko.maven.plugin</groupId>
<artifactId>gitflow-maven-plugin</artifactId>
<version>1.18.0</version>
</plugin>
<plugin>
<groupId>com.github.spotbugs</groupId>
<artifactId>spotbugs-maven-plugin</artifactId>
<version>4.7.2.0</version>
</plugin>
</plugins>
</pluginManagement>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<configuration>
<doclint>none</doclint>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-report-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>taglist-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>com.github.spotbugs</groupId>
<artifactId>spotbugs-maven-plugin</artifactId>
</plugin>
</plugins>
</reporting>
<profiles>
<profile>
<id>release</id>
<build>
<plugins>
<!-- Source -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.2.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Javadoc -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.4.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- GPG -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>3.0.1</version>
<executions>
<execution>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>sonatype-nexus-staging</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>true</autoReleaseAfterClose>
</configuration>
</plugin>
</plugins>
</build>
<distributionManagement>
<snapshotRepository>
<id>sonatype-nexus-snapshots</id>
<url>https://oss.sonatype.org/content/repositories/snapshots/</url>
</snapshotRepository>
<repository>
<id>sonatype-nexus-staging</id>
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
</repository>
</distributionManagement>
</profile>
</profiles>
</project>

@ -1,9 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.10.3</version>
<artifactId>webmagic</artifactId>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -15,11 +20,6 @@
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
@ -45,12 +45,6 @@
<artifactId>mockito-all</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>

@ -71,6 +71,7 @@ public class Page {
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}
* and whose {@link #request} is the given request.
*
* @param request the {@link Request}.
* @return the page.
* @since 0.10.0
*/

@ -1,13 +1,14 @@
package us.codecraft.webmagic;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.Experimental;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.Experimental;
/**
* Object contains url to crawl.<br>
* It contains some additional information.<br>
@ -35,7 +36,7 @@ public class Request implements Serializable {
/**
* Store additional information in extras.
*/
private Map<String, Object> extras;
private Map<String, Object> extras = new HashMap<>();
/**
* cookies for current url, if not set use Site's cookies
@ -93,9 +94,6 @@ public class Request implements Serializable {
}
public <T> Request putExtra(String key, T value) {
if (extras == null) {
extras = new HashMap<String, Object>();
}
extras.put(key, value);
return this;
}
@ -105,11 +103,11 @@ public class Request implements Serializable {
}
public Map<String, Object> getExtras() {
return extras;
return Collections.unmodifiableMap(extras);
}
public Request setExtras(Map<String, Object> extras) {
this.extras = extras;
this.extras.putAll(extras);
return this;
}

@ -9,11 +9,8 @@ import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
@ -75,9 +72,9 @@ public class Spider implements Runnable, Task {
protected Site site;
protected String uuid;
protected Scheduler scheduler = new QueueScheduler();
protected SpiderScheduler scheduler;
protected Logger logger = LoggerFactory.getLogger(getClass());
protected CountableThreadPool threadPool;
@ -88,7 +85,7 @@ public class Spider implements Runnable, Task {
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
protected boolean exitWhenComplete = true;
protected volatile boolean exitWhenComplete = true;
protected final static int STAT_INIT = 0;
@ -100,10 +97,6 @@ public class Spider implements Runnable, Task {
protected boolean destroyWhenExit = true;
private ReentrantLock newUrlLock = new ReentrantLock();
private Condition newUrlCondition = newUrlLock.newCondition();
private List<SpiderListener> spiderListeners;
private final AtomicLong pageCount = new AtomicLong(0);
@ -131,6 +124,7 @@ public class Spider implements Runnable, Task {
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.scheduler = new SpiderScheduler(new QueueScheduler());
}
/**
@ -186,15 +180,15 @@ public class Spider implements Runnable, Task {
/**
* set scheduler for Spider
*
* @param scheduler scheduler
* @param updateScheduler scheduler
* @return this
* @see Scheduler
* @since 0.2.1
*/
public Spider setScheduler(Scheduler scheduler) {
public Spider setScheduler(Scheduler updateScheduler) {
checkIfRunning();
Scheduler oldScheduler = this.scheduler;
this.scheduler = scheduler;
SpiderScheduler oldScheduler = this.scheduler;
scheduler.setScheduler(updateScheduler);
if (oldScheduler != null) {
Request request;
while ((request = oldScheduler.poll(this)) != null) {
@ -213,7 +207,7 @@ public class Spider implements Runnable, Task {
* @deprecated
*/
@Deprecated
public Spider pipeline(Pipeline pipeline) {
public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline);
}
@ -264,7 +258,7 @@ public class Spider implements Runnable, Task {
* @deprecated
*/
@Deprecated
public Spider downloader(Downloader downloader) {
public Spider downloader(Downloader downloader) {
return setDownloader(downloader);
}
@ -333,10 +327,10 @@ public class Spider implements Runnable, Task {
}
} else {
// wait until new url added
if (waitNewUrl()) {
//if interrupted
if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
// if interrupted
break;
}
}
continue;
}
}
@ -353,7 +347,7 @@ public class Spider implements Runnable, Task {
logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
signalNewUrl();
scheduler.signalNewUrl();
}
}
});
@ -536,7 +530,7 @@ public class Spider implements Runnable, Task {
for (String url : urls) {
addRequest(new Request(url));
}
signalNewUrl();
scheduler.signalNewUrl();
return this;
}
@ -588,42 +582,10 @@ public class Spider implements Runnable, Task {
for (Request request : requests) {
addRequest(request);
}
signalNewUrl();
scheduler.signalNewUrl();
return this;
}
/**
*
* @return isInterrupted
*/
private boolean waitNewUrl() {
// now there may not be any thread live
newUrlLock.lock();
try {
//double checkunnecessary, unless very fast concurrent
if (threadPool.getThreadAlive() == 0) {
return false;
}
//wait for amount of time
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
return false;
} catch (InterruptedException e) {
// logger.warn("waitNewUrl - interrupted, error {}", e);
return true;
} finally {
newUrlLock.unlock();
}
}
private void signalNewUrl() {
try {
newUrlLock.lock();
newUrlCondition.signalAll();
} finally {
newUrlLock.unlock();
}
}
public void start() {
runAsync();
}
@ -636,6 +598,13 @@ public class Spider implements Runnable, Task {
}
}
/**
* Requests that the spider stop once all tasks in the queue are completed and all worker
* threads are also completed, by setting {@code exitWhenComplete} to {@code true}.
*/
public void stopWhenComplete(){
this.exitWhenComplete = true;
}
/**
* start with more than one threads
*
@ -799,7 +768,7 @@ public class Spider implements Runnable, Task {
}
public Scheduler getScheduler() {
return scheduler;
return scheduler.getScheduler();
}
/**

@ -0,0 +1,59 @@
package us.codecraft.webmagic;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
/**
 * Holds the {@link Scheduler} a {@link Spider} works with and pairs it with a
 * lock/condition that callers can use to wait for, and announce, newly added URLs.
 *
 * <p>{@link #poll(Spider)} and {@link #push(Request, Spider)} simply delegate to the
 * wrapped scheduler, which can be swapped at any time through
 * {@link #setScheduler(Scheduler)}.
 */
public class SpiderScheduler {

    /** Delegate that actually stores and hands out requests. */
    private Scheduler scheduler;

    /** Guards {@link #newUrlCondition}. */
    private final ReentrantLock newUrlLock = new ReentrantLock();

    /** Signalled by {@link #signalNewUrl()} and awaited by {@link #waitNewUrl}. */
    private final Condition newUrlCondition = newUrlLock.newCondition();

    /**
     * Creates a wrapper around the given scheduler.
     *
     * @param scheduler the scheduler to delegate to
     */
    public SpiderScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * @return the scheduler currently being delegated to
     */
    public Scheduler getScheduler() {
        return scheduler;
    }

    /**
     * Replaces the wrapped scheduler.
     *
     * @param scheduler the new scheduler to delegate to
     */
    public void setScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * Delegates to {@link Scheduler#poll} of the wrapped scheduler.
     *
     * @param spider the spider passed through to the wrapped scheduler
     * @return whatever the wrapped scheduler returns
     */
    public Request poll(Spider spider) {
        return scheduler.poll(spider);
    }

    /**
     * Delegates to {@link Scheduler#push} of the wrapped scheduler.
     *
     * @param request the request to hand to the wrapped scheduler
     * @param spider  the spider passed through to the wrapped scheduler
     */
    public void push(Request request, Spider spider) {
        scheduler.push(request, spider);
    }

    /**
     * Waits up to {@code emptySleepTime} milliseconds for {@link #signalNewUrl()} to be
     * called, unless the pool reports no alive threads, in which case it returns at once.
     *
     * @param threadPool     pool whose alive-thread count is checked before waiting
     * @param emptySleepTime maximum time to wait, in milliseconds
     * @return {@code true} only if the calling thread was interrupted while waiting;
     *         {@code false} otherwise (no alive threads, signalled, or timed out)
     */
    public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
        newUrlLock.lock();
        try {
            if (threadPool.getThreadAlive() == 0) {
                // No thread is alive according to the pool: do not wait.
                return false;
            }
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
            return false;
        } catch (InterruptedException e) {
            // The interruption is reported to the caller through the return value.
            return true;
        } finally {
            newUrlLock.unlock();
        }
    }

    /**
     * Wakes up every thread currently blocked in {@link #waitNewUrl}.
     */
    public void signalNewUrl() {
        // Acquire the lock before entering the try block: if lock() itself failed inside
        // the try, the finally clause would call unlock() on a lock this thread does not
        // hold and throw IllegalMonitorStateException, hiding the original failure.
        newUrlLock.lock();
        try {
            newUrlCondition.signalAll();
        } finally {
            newUrlLock.unlock();
        }
    }
}

@ -42,7 +42,9 @@ public class HttpUriRequestConverter {
HttpClientContext httpContext = new HttpClientContext();
if (proxy != null && proxy.getUsername() != null) {
AuthState authState = new AuthState();
authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY);
UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
authState.update(proxyAuthScheme, proxyCredentials);
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
}
if (request.getCookies() != null && !request.getCookies().isEmpty()) {

@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable {
return elements;
}
@Override
public Selectable smartContent() {
SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, getSourceTexts());

@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable {
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
}
@Override
public Selectable smartContent() {
throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
}
@Override
public Selectable links() {
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");

@ -51,14 +51,6 @@ public interface Selectable {
* @return new Selectable after extract
*/
public Selectable css(String selector, String attrName);
/**
* select smart content with ReadAbility algorithm
*
* @return content
*/
public Selectable smartContent();
/**
* select all links
*

@ -21,6 +21,10 @@ public abstract class CharsetUtils {
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
/**
* Not meant to be called: the constructor is private and always throws
* {@link AssertionError}, so no instance can be created through it.
*/
private CharsetUtils() {
throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!");
}
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
String charset;
// charset

@ -116,6 +116,10 @@ public class UrlUtils {
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);
public static String getCharset(String contentType) {
if (contentType == null) {
return null;
}
Matcher matcher = patternForCharset.matcher(contentType);
if (matcher.find()) {
String charset = matcher.group(1);

@ -1,21 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -1,9 +1,13 @@
package us.codecraft.webmagic;
import static org.assertj.core.api.Assertions.assertThat;
import java.util.Collections;
import java.util.Map;
import org.junit.Test;
import us.codecraft.webmagic.utils.HttpConstant;
import static org.assertj.core.api.Assertions.assertThat;
import us.codecraft.webmagic.utils.HttpConstant;
/**
* @author code4crafter@gmail.com
@ -22,4 +26,28 @@ public class RequestTest {
assertThat(requestA).isNotEqualTo(requestB);
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
}
/**
 * Extras supplied through {@code setExtras} (here an immutable singleton map) and an
 * extra added afterwards through {@code putExtra} must both be readable.
 */
@Test
public void testSetExtras() {
    Request populated = new Request();
    populated
            .setExtras(Collections.<String, Object>singletonMap("a", "1"))
            .putExtra("b", "2");

    String first = populated.<String>getExtra("a");
    String second = populated.<String>getExtra("b");

    assertThat(first).isEqualTo("1");
    assertThat(second).isEqualTo("2");
}
/**
 * An entry stored with {@code putExtra} must show up in the map returned by
 * {@code getExtras}.
 */
@Test
public void testGetExtras() {
    Request populated = new Request();
    populated.putExtra("a", "1");

    Map<String, Object> visibleExtras = populated.getExtras();

    assertThat(visibleExtras).containsEntry("a", "1");
}
/**
 * Writing into the map returned by {@code getExtras} is expected to fail with
 * {@link UnsupportedOperationException}.
 */
@Test(expected = UnsupportedOperationException.class)
public void testGetExtrasShouldBeUnmodifiable() {
    Map<String, Object> exposedExtras = new Request().getExtras();
    exposedExtras.put("a", "1");
}
}

@ -1,8 +1,12 @@
package us.codecraft.webmagic;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
@ -14,4 +18,23 @@ public class SiteTest {
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
}
/**
 * Adds one cookie without a domain and three cookies spread over two explicit domains,
 * then expects {@code getAllCookies()} to expose exactly those two domains with the
 * values that were stored for them.
 */
@Test
public void addCookieTest() {
    final String domainA = "example.com";
    final String domainB = "exampleCopy.com";

    Site configured = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
    configured.addCookie("cookieDefault", "cookie-webmagicDefault");
    configured.addCookie(domainA, "cookie", "cookie-webmagic");
    configured.addCookie(domainA, "cookieCopy", "cookie-webmagicCopy");
    configured.addCookie(domainB, "cookie", "cookie-webmagic");

    Map<String, Map<String, String>> cookiesByDomain = configured.getAllCookies();
    // Snapshot of the domain keys, used only for the final size check.
    List<String> domainKeys = new ArrayList<>(cookiesByDomain.keySet());

    assertEquals("cookie-webmagic", cookiesByDomain.get(domainA).get("cookie"));
    assertEquals("cookie-webmagicCopy", cookiesByDomain.get(domainA).get("cookieCopy"));
    assertEquals("cookie-webmagic", cookiesByDomain.get(domainB).get("cookie"));
    assertEquals(2, domainKeys.size());
}
}

@ -40,6 +40,7 @@ import static com.github.dreamhead.moco.Moco.uri;
import static com.github.dreamhead.moco.Moco.with;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;
/**
@ -333,5 +334,13 @@ public class HttpClientDownloaderTest {
});
}
/**
 * Calling {@code download} with a {@code null} task is expected to raise
 * {@link NullPointerException}.
 */
@Test
public void test_no_task_download() {
    final HttpClientDownloader downloader = new HttpClientDownloader();
    final Request target = new Request();
    target.setUrl("http://127.0.0.1:13423/");

    assertThrows(NullPointerException.class, () -> downloader.download(target, null));
}
}

@ -8,19 +8,19 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpHost;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
/**
* @author yxssfxwzy@sina.com May 30, 2014
*
*
*/
public class ProxyTest {
class ProxyTest {
private static List<String[]> httpProxyList = new ArrayList<String[]>();
@BeforeClass
public static void before() {
@BeforeAll
static void before() {
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
// "0.0.0.4:0" };
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
@ -48,7 +48,7 @@ public class ProxyTest {
}
@Test
public void testCreate() {
void testCreate() {
Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080"));
assertNull(proxy.getScheme());
assertNull(proxy.getUsername());
@ -86,7 +86,15 @@ public class ProxyTest {
}
@Test
public void testToString() {
void testEqualsHashCode() {
var proxy0 = new Proxy("::1", 1080);
var proxy1 = new Proxy("::1", 1080);
assertEquals(proxy0, proxy1);
assertEquals(proxy0.hashCode(), proxy1.hashCode());
}
@Test
void testToString() {
assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString());
assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString());
assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString());

@ -0,0 +1,59 @@
package us.codecraft.webmagic.selector;
import static org.junit.Assert.assertEquals;
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;
/**
 * Exercises {@link AndSelector#selectList} with a CSS selector for {@code div} combined
 * with an XPath selector on the {@code class} attribute.
 */
public class AndSelectorTest {

    /** Page used by every test: a container div wrapping an "item1" and an "item2" div. */
    private static final String HTML_CONTENT = "<!DOCTYPE html>\n" +
            "<html lang=\"en\">\n" +
            "<head>\n" +
            " <meta charset=\"UTF-8\">\n" +
            " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
            " <title>HTML with XPath</title>\n" +
            "</head>\n" +
            "<body>\n" +
            " <div class=\"container\">\n" +
            " <div class=\"item1\">Item 1</div>\n" +
            " <div class=\"item2\">Item 2</div>\n" +
            " </div>\n" +
            "</body>\n" +
            "</html>";

    /** With an XPath matching class "item1", the first result must be that div. */
    @Test
    public void testSelectList() {
        List<String> selected = divAnd("//div[@class='item1']").selectList(HTML_CONTENT);

        assertEquals("<div class=\"item1\">\n Item 1\n</div>", selected.get(0));
    }

    /** With an XPath for a class that is absent from the page, nothing must be selected. */
    @Test
    public void testSelectList_NoResults() {
        List<String> selected = divAnd("//div[@class='item']").selectList(HTML_CONTENT);

        assertEquals(0, selected.size());
    }

    /** Builds an {@link AndSelector} from the CSS selector "div" followed by the given XPath. */
    private static AndSelector divAnd(String xpath) {
        List<Selector> parts = new ArrayList<>();
        parts.add(new CssSelector("div"));
        parts.add(new XpathSelector(xpath));
        return new AndSelector(parts);
    }
}

@ -0,0 +1,39 @@
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.runners.MockitoJUnitRunner;
import java.util.List;
import static org.junit.Assert.*;
/**
 * Tests for {@link CssSelector} applied directly to a jsoup {@link Element}.
 */
public class CssSelectorTest {

    /** Minimal page containing a single div with id "dummyDiv". */
    private static final String HTML_CONTENT =
            "<html><head><title>Dummy Page</title></head><body><div id=\"dummyDiv\">Hello World!</div></body></html>";

    /** Parses {@link #HTML_CONTENT} and returns the element whose id is "dummyDiv". */
    private static Element dummyDiv() {
        return Jsoup.parse(HTML_CONTENT).getElementById("dummyDiv");
    }

    /** Selecting "div" on the dummy div must yield a non-null element. */
    @Test
    public void testSelectElement() {
        Element selected = new CssSelector("div").selectElement(dummyDiv());
        assertNotNull(selected);
    }

    /** Selecting "div" on the dummy div must yield exactly one entry: the rendered div. */
    @Test
    public void testSelectList() {
        List<String> selected = new CssSelector("div").selectList(dummyDiv());
        assertEquals(1, selected.size());
        assertEquals("[<div id=\"dummyDiv\">\n Hello World!\n</div>]", selected.toString());
    }
}

@ -0,0 +1,44 @@
package us.codecraft.webmagic.selector;
import static org.junit.Assert.assertEquals;
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;
/**
 * Tests for {@link OrSelector} built from one CSS selector and two XPath selectors.
 */
public class OrSelectorTest {

    /** Input page: a head with three children and a container div holding two item divs. */
    private static final String HTML_CONTENT = "<!DOCTYPE html>\n" +
            "<html lang=\"en\">\n" +
            "<head>\n" +
            " <meta charset=\"UTF-8\">\n" +
            " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
            " <title>HTML with XPath</title>\n" +
            "</head>\n" +
            "<body>\n" +
            " <div class=\"container\">\n" +
            " <div class=\"item1\">Item 1</div>\n" +
            " <div class=\"item2\">Item 2</div>\n" +
            " </div>\n" +
            "</body>\n" +
            "</html>";

    /** Expected {@code toString()} of the result list: head, then item1, then item2. */
    private static final String EXPECTED_RESULT = "[<head>\n" +
            " <meta charset=\"UTF-8\">\n" +
            " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
            " <title>HTML with XPath</title>\n" +
            "</head>, <div class=\"item1\">\n" +
            " Item 1\n" +
            "</div>, <div class=\"item2\">\n" +
            " Item 2\n" +
            "</div>]";

    /** The result list must render as the head followed by both item divs, in selector order. */
    @Test
    public void testSelectList() {
        List<Selector> alternatives = new ArrayList<Selector>();
        alternatives.add(new CssSelector("head"));
        alternatives.add(new XpathSelector("//div[@class='item1']"));
        alternatives.add(new XpathSelector("//div[@class='item2']"));

        List<String> selected = new OrSelector(alternatives).selectList(HTML_CONTENT);

        assertEquals(EXPECTED_RESULT, selected.toString());
    }
}

@ -0,0 +1,16 @@
package us.codecraft.webmagic.utils;
import static org.junit.jupiter.api.Assertions.assertNull;
import java.io.IOException;
import org.junit.jupiter.api.Test;
/**
 * Tests for {@link CharsetUtils}.
 */
class CharsetUtilsTest {

    /** Passing {@code null} as the first argument together with an empty byte array must give {@code null}. */
    @Test
    void testDetectCharset() throws IOException {
        byte[] noContent = new byte[0];
        assertNull(CharsetUtils.detectCharset(null, noContent));
    }
}

@ -1,5 +1,7 @@
package us.codecraft.webmagic.utils;
import static org.junit.Assert.assertNull;
import org.junit.Assert;
import org.junit.Test;
@ -43,5 +45,9 @@ public class UrlUtilsTest {
Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url));
}
/** Checks that {@code UrlUtils.getCharset} returns {@code null} when it is handed {@code null}. */
@Test
public void testGetCharset() {
assertNull(UrlUtils.getCharset(null));
}
}

@ -1,21 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<Configuration>
<Appenders>
<Console name="stdout" target="SYSTEM_OUT">
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</Console>
</Appenders>
<Loggers>
<Logger name="org.apache" level="warn" additivity="false">
<AppenderRef ref="stdout" />
</Logger>
<Root level="info">
<AppenderRef ref="stdout" />
</Root>
</Loggers>
</Configuration>

@ -1,14 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.10.3</version>
<artifactId>webmagic</artifactId>
<version>1.0.0</version>
</parent>
<artifactId>webmagic-coverage</artifactId>

@ -1,15 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.10.3</version>
<artifactId>webmagic</artifactId>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-extension</artifactId>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.32</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
@ -29,10 +40,6 @@
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>

@ -1,5 +1,9 @@
package us.codecraft.webmagic.model;
import lombok.Getter;
import lombok.Setter;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;
/**
@ -7,18 +11,18 @@ import us.codecraft.webmagic.selector.Selector;
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
class Extractor {
public class Extractor {
@Getter @Setter
protected Selector selector;
@Getter
protected final Source source;
protected final boolean notNull;
protected final boolean multi;
static enum Source {Html, Url, RawHtml, RawText}
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector;
this.source = source;
@ -26,23 +30,11 @@ class Extractor {
this.multi = multi;
}
Selector getSelector() {
return selector;
}
Source getSource() {
return source;
}
boolean isNotNull() {
public boolean isNotNull() {
return notNull;
}
boolean isMulti() {
public boolean isMulti() {
return multi;
}
void setSelector(Selector selector) {
this.selector = selector;
}
}

@ -1,58 +1,33 @@
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import lombok.Getter;
import lombok.Setter;
/**
* Wrapper of field and extractor.
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
class FieldExtractor extends Extractor {
public class FieldExtractor extends Extractor {
@Getter
private final Field field;
@Getter @Setter
private Method setterMethod;
@Getter @Setter
private ObjectFormatter objectFormatter;
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
super(selector, source, notNull, multi);
this.field = field;
}
Field getField() {
return field;
}
Selector getSelector() {
return selector;
}
Source getSource() {
return source;
}
void setSetterMethod(Method setterMethod) {
this.setterMethod = setterMethod;
}
Method getSetterMethod() {
return setterMethod;
}
boolean isNotNull() {
return notNull;
}
ObjectFormatter getObjectFormatter() {
return objectFormatter;
}
void setObjectFormatter(ObjectFormatter objectFormatter) {
this.objectFormatter = objectFormatter;
}
}

@ -3,17 +3,21 @@ package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import lombok.Getter;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.model.sources.SourceTextExtractor;
import us.codecraft.webmagic.model.sources.Source.*;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
@ -29,14 +33,19 @@ import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;
*/
class PageModelExtractor {
@Getter
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
@Getter
private Selector targetUrlRegionSelector;
@Getter
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
@Getter
private Selector helpUrlRegionSelector;
@Getter
private Class clazz;
private List<FieldExtractor> fieldExtractors;
@ -86,7 +95,7 @@ class PageModelExtractor {
regexPattern = ".*";
}
fieldExtractor = new FieldExtractor(field,
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
@ -112,7 +121,7 @@ class PageModelExtractor {
default:
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
}
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
@ -127,26 +136,23 @@ class PageModelExtractor {
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
Selector selector = ExtractorUtils.getSelector(extractBy);
ExtractBy.Source source0 = extractBy.source();
if (extractBy.type()== ExtractBy.Type.JsonPath){
source0 = RawText;
}
FieldExtractor.Source source = null;
switch (source0){
ExtractBy.Source extractSource = extractBy.source();
if (extractBy.type()== ExtractBy.Type.JsonPath)
extractSource = RawText;
Source source = null;
switch (extractSource) {
case RawText:
source = FieldExtractor.Source.RawText;
source = new RawText();
break;
case RawHtml:
source = FieldExtractor.Source.RawHtml;
source = new RawHtml();
break;
case SelectedHtml:
source =FieldExtractor.Source.Html;
source = new SelectedHtml();
break;
default:
source =FieldExtractor.Source.Html;
source = new SelectedHtml();
}
fieldExtractor = new FieldExtractor(field, selector, source,
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
@ -193,7 +199,7 @@ class PageModelExtractor {
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
}
}
@ -233,135 +239,15 @@ class PageModelExtractor {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
if (fieldExtractor.isMulti()) {
List<String> value;
switch (fieldExtractor.getSource()) {
case RawHtml:
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
break;
case Html:
if (isRaw) {
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
} else {
value = fieldExtractor.getSelector().selectList(html);
}
break;
case Url:
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
break;
case RawText:
value = fieldExtractor.getSelector().selectList(page.getRawText());
break;
default:
value = fieldExtractor.getSelector().selectList(html);
}
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
return null;
}
if (fieldExtractor.getObjectFormatter() != null) {
List<Object> converted = convert(value, fieldExtractor.getObjectFormatter());
setField(o, fieldExtractor, converted);
} else {
setField(o, fieldExtractor, value);
}
} else {
String value;
switch (fieldExtractor.getSource()) {
case RawHtml:
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
break;
case Html:
if (isRaw) {
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
} else {
value = fieldExtractor.getSelector().select(html);
}
break;
case Url:
value = fieldExtractor.getSelector().select(page.getUrl().toString());
break;
case RawText:
value = fieldExtractor.getSelector().select(page.getRawText());
break;
default:
value = fieldExtractor.getSelector().select(html);
}
if (value == null && fieldExtractor.isNotNull()) {
return null;
}
if (fieldExtractor.getObjectFormatter() != null) {
Object converted = convert(value, fieldExtractor.getObjectFormatter());
if (converted == null && fieldExtractor.isNotNull()) {
return null;
}
setField(o, fieldExtractor, converted);
} else {
setField(o, fieldExtractor, value);
}
}
PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
if (!field.operation(o, fieldExtractor, logger))
return null;
}
if (AfterExtractor.class.isAssignableFrom(clazz)) {
if (AfterExtractor.class.isAssignableFrom(clazz))
((AfterExtractor) o).afterProcess(page);
}
} catch (InstantiationException e) {
logger.error("extract fail", e);
} catch (IllegalAccessException e) {
logger.error("extract fail", e);
} catch (InvocationTargetException e) {
} catch (Exception e) {
logger.error("extract fail", e);
}
return o;
}
private Object convert(String value, ObjectFormatter objectFormatter) {
try {
Object format = objectFormatter.format(value);
logger.debug("String {} is converted to {}", value, format);
return format;
} catch (Exception e) {
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
}
return null;
}
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter) {
List<Object> objects = new ArrayList<Object>();
for (String value : values) {
Object converted = convert(value, objectFormatter);
if (converted != null) {
objects.add(converted);
}
}
return objects;
}
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
if (value == null) {
return;
}
if (fieldExtractor.getSetterMethod() != null) {
fieldExtractor.getSetterMethod().invoke(o, value);
}
fieldExtractor.getField().set(o, value);
}
Class getClazz() {
return clazz;
}
List<Pattern> getTargetUrlPatterns() {
return targetUrlPatterns;
}
List<Pattern> getHelpUrlPatterns() {
return helpUrlPatterns;
}
Selector getTargetUrlRegionSelector() {
return targetUrlRegionSelector;
}
Selector getHelpUrlRegionSelector() {
return helpUrlRegionSelector;
}
}

@ -0,0 +1,42 @@
package us.codecraft.webmagic.model.fields;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import lombok.Getter;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
public class MultipleField extends PageField {
@Getter
private List<String> fieldNames;
public MultipleField(List<String> fieldNames) {
this.fieldNames = fieldNames;
}
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull())
return false;
if (fieldExtractor.getObjectFormatter() != null) {
List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
setField(o, fieldExtractor, converted);
}
else
setField(o, fieldExtractor, this.fieldNames);
return true;
}
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
List<Object> objects = new ArrayList<>();
for (String value : values) {
Object converted = this.convert(value, objectFormatter, logger);
if (converted != null)
objects.add(converted);
}
return objects;
}
}

@ -0,0 +1,31 @@
package us.codecraft.webmagic.model.fields;
import java.lang.reflect.InvocationTargetException;
import org.slf4j.Logger;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
/**
 * Base type for an extracted value that can be written into a target object.
 * Supplies the conversion and field-assignment helpers shared by subclasses.
 */
public abstract class PageField {

    /**
     * Applies this extracted value to {@code o}.
     *
     * @param o              target object
     * @param fieldExtractor describes the target field and how to populate it
     * @param logger         logger used while converting
     * @return implementation-defined success flag
     * @throws IllegalAccessException    if the field or setter cannot be accessed
     * @throws InvocationTargetException if the setter itself throws
     */
    public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException;

    /**
     * Formats {@code value} with {@code objectFormatter}.
     *
     * @return the formatted object, or {@code null} when formatting (or the debug log call)
     *         throws; the failure is logged at error level
     */
    protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
        try {
            Object formatted = objectFormatter.format(value);
            logger.debug("String {} is converted to {}", value, formatted);
            return formatted;
        } catch (Exception e) {
            logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
            return null;
        }
    }

    /**
     * Stores {@code value} into {@code o}; a {@code null} value is ignored.
     * When the extractor has a setter method it is invoked first, and the field is
     * then assigned directly as well.
     */
    protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
        if (value == null) {
            return;
        }
        if (fieldExtractor.getSetterMethod() != null) {
            fieldExtractor.getSetterMethod().invoke(o, value);
        }
        fieldExtractor.getField().set(o, value);
    }
}

@ -0,0 +1,28 @@
package us.codecraft.webmagic.model.fields;
import java.lang.reflect.InvocationTargetException;
import org.slf4j.Logger;
import lombok.Getter;
import us.codecraft.webmagic.model.FieldExtractor;
public class SingleField extends PageField {
@Getter
private String fieldName;
public SingleField(String fieldName) {
this.fieldName = fieldName;
}
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
if (fieldExtractor.getObjectFormatter() != null) {
Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
if (converted == null && fieldExtractor.isNotNull())
return false;
setField(o, fieldExtractor, converted);
} else
setField(o, fieldExtractor, this.fieldName);
return true;
}
}

@ -0,0 +1,85 @@
package us.codecraft.webmagic.model.formatter;
/**
 * Maps a type to its boxed basic class when the type is one this detector recognises.
 */
public interface BasicClassDetector {

    /**
     * @param type the type to inspect
     * @return the boxed class this detector stands for when {@code type} is that class or
     *         its primitive counterpart, otherwise {@code null}
     */
    Class<?> detectBasicClass(Class<?> type);
}

/** Recognises {@code int} and {@link Integer}. */
class IntegerClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(int.class) || type.equals(Integer.class);
        return matches ? Integer.class : null;
    }
}

/** Recognises {@code long} and {@link Long}. */
class LongClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(long.class) || type.equals(Long.class);
        return matches ? Long.class : null;
    }
}

/** Recognises {@code double} and {@link Double}. */
class DoubleClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(double.class) || type.equals(Double.class);
        return matches ? Double.class : null;
    }
}

/** Recognises {@code float} and {@link Float}. */
class FloatClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(float.class) || type.equals(Float.class);
        return matches ? Float.class : null;
    }
}

/** Recognises {@code short} and {@link Short}. */
class ShortClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(short.class) || type.equals(Short.class);
        return matches ? Short.class : null;
    }
}

/** Recognises {@code char} and {@link Character}. */
class CharacterClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(char.class) || type.equals(Character.class);
        return matches ? Character.class : null;
    }
}

/** Recognises {@code byte} and {@link Byte}. */
class ByteClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(byte.class) || type.equals(Byte.class);
        return matches ? Byte.class : null;
    }
}

/** Recognises {@code boolean} and {@link Boolean}. */
class BooleanClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(boolean.class) || type.equals(Boolean.class);
        return matches ? Boolean.class : null;
    }
}

@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
}
protected abstract T formatTrimmed(String raw) throws Exception;
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
public static final List<BasicClassDetector> basicClassDetector= Arrays.asList(new IntegerClassDetector(),
new LongClassDetector(),
new FloatClassDetector(),
new DoubleClassDetector(),
new ShortClassDetector(),
new ByteClassDetector(),
new BooleanClassDetector(),
new CharacterClassDetector());
public static Class<?> detectBasicClass(Class<?> type) {
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
return Integer.class;
} else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
return Long.class;
} else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
return Double.class;
} else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
return Float.class;
} else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
return Short.class;
} else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
return Character.class;
} else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
return Byte.class;
} else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
return Boolean.class;
for (BasicClassDetector detector : basicClassDetector) {
Class<?> detectedClass = detector.detectBasicClass(type);
if (detectedClass != null) {
return detectedClass;
}
}
return type;
}
@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
}
}
}

@ -0,0 +1,68 @@
package us.codecraft.webmagic.model.sources;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
/**
 * Strategy for choosing which text a field extractor's selector is applied to.
 * Each nested implementation picks a different input: the page's document, the
 * supplied html string, the page URL, or the page's raw text.
 */
public interface Source {

    /**
     * Applies the extractor's selector and returns a single result.
     *
     * @param page           page being extracted
     * @param html           html string supplied by the caller
     * @param isRaw          flag consulted by {@link SelectedHtml} to choose its input
     * @param fieldExtractor provides the selector to apply
     * @return the selected text
     */
    String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);

    /**
     * List-returning counterpart of {@link #getText}.
     */
    List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);

    /** Always selects through {@code page.getHtml()}; {@code html} and {@code isRaw} are ignored. */
    class RawHtml implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return page.getHtml().selectDocument(fieldExtractor.getSelector());
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
        }
    }

    /** Selects through {@code page.getHtml()} when {@code isRaw} is set, otherwise against {@code html}. */
    class SelectedHtml implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return isRaw
                    ? page.getHtml().selectDocument(fieldExtractor.getSelector())
                    : fieldExtractor.getSelector().select(html);
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return isRaw
                    ? page.getHtml().selectDocumentForList(fieldExtractor.getSelector())
                    : fieldExtractor.getSelector().selectList(html);
        }
    }

    /** Selects against the string form of {@code page.getUrl()}. */
    class Url implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            String url = page.getUrl().toString();
            return fieldExtractor.getSelector().select(url);
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            String url = page.getUrl().toString();
            return fieldExtractor.getSelector().selectList(url);
        }
    }

    /** Selects against {@code page.getRawText()}. */
    class RawText implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            String rawText = page.getRawText();
            return fieldExtractor.getSelector().select(rawText);
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            String rawText = page.getRawText();
            return fieldExtractor.getSelector().selectList(rawText);
        }
    }

    /** Always selects against the supplied {@code html}; {@code page} and {@code isRaw} are ignored. */
    class DefaultSource implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().select(html);
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().selectList(html);
        }
    }
}

@ -0,0 +1,17 @@
package us.codecraft.webmagic.model.sources;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.MultipleField;
import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.fields.SingleField;
/**
 * Runs a field extractor's {@link Source} and wraps the outcome in the matching {@link PageField}.
 */
public class SourceTextExtractor {

    /**
     * Extracts the text for one field.
     *
     * @param page           page being extracted
     * @param html           html string passed through to the source
     * @param isRaw          flag passed through to the source
     * @param fieldExtractor supplies the source, the selector and the multi flag
     * @return a {@link MultipleField} wrapping the list result when the extractor is multi,
     *         otherwise a {@link SingleField} wrapping the single result
     */
    public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
        Source textSource = fieldExtractor.getSource();
        if (!fieldExtractor.isMulti()) {
            return new SingleField(textSource.getText(page, html, isRaw, fieldExtractor));
        }
        return new MultipleField(textSource.getTextList(page, html, isRaw, fieldExtractor));
    }
}

@ -102,7 +102,7 @@ public class RedisPriorityScheduler extends RedisScheduler {
}
private void setExtrasInItem(Jedis jedis,Request request, Task task) {
if (request.getExtras() != null) {
if (!request.getExtras().isEmpty()) {
String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset(getItemKey(task), field, value);

@ -84,7 +84,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
return true;
}
if (request.getExtras() != null && !request.getExtras().isEmpty()) {
if (!request.getExtras().isEmpty()) {
return true;
}
if (request.getPriority() != 0L) {

@ -1,21 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -13,7 +13,6 @@ import static org.assertj.core.api.Assertions.assertThat;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public class ConfigurablePageProcessorTest {

@ -12,7 +12,6 @@ import static org.assertj.core.api.Assertions.assertThat;
/**
* @author code4crafter@gmail.com
* @date 14-4-4
*/
public class ModelPageProcessorTest {

@ -1,21 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<Configuration>
<Appenders>
<Console name="stdout" target="SYSTEM_OUT">
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</Console>
</Appenders>
<Loggers>
<Logger name="org.apache" level="warn" additivity="false">
<AppenderRef ref="stdout" />
</Logger>
<Root level="info">
<AppenderRef ref="stdout" />
</Root>
</Loggers>
</Configuration>

@ -1,9 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.10.3</version>
<artifactId>webmagic</artifactId>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -20,10 +25,6 @@
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>org.mapdb</groupId>
<artifactId>mapdb</artifactId>
@ -42,7 +43,7 @@
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.15.2</version>
<version>2.16.0</version>
</dependency>
</dependencies>

@ -1,26 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<logger name="org.springframework" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<Configuration>
<Appenders>
<Console name="stdout" target="SYSTEM_OUT">
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</Console>
</Appenders>
<Loggers>
<Logger name="org.springframework" level="warn" additivity="false">
<AppenderRef ref="stdout" />
</Logger>
<Logger name="net.sf.ehcache" level="warn" additivity="false">
<AppenderRef ref="stdout" />
</Logger>
<Root level="info">
<AppenderRef ref="stdout" />
</Root>
</Loggers>
</Configuration>

@ -1,14 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.10.3</version>
<artifactId>webmagic</artifactId>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-saxon</artifactId>
<properties>
<maven.deploy.skip>true</maven.deploy.skip>
</properties>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
@ -23,23 +32,6 @@
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0-M1</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -1,9 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.10.3</version>
<artifactId>webmagic</artifactId>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -13,6 +18,14 @@
</properties>
<dependencies>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
</dependency>
<dependency>
<groupId>org.jruby</groupId>
<artifactId>jruby</artifactId>
@ -30,25 +43,22 @@
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.32</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
@ -90,4 +100,4 @@
</build>
</project>
</project>

@ -0,0 +1,47 @@
package us.codecraft.webmagic.scripts;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.Getter;
import lombok.Setter;
import us.codecraft.webmagic.scripts.languages.JRuby;
import us.codecraft.webmagic.scripts.languages.Javascript;
import us.codecraft.webmagic.scripts.languages.Language;
import us.codecraft.webmagic.utils.WMCollections;
/**
 * Options collected from the script console command line: the script language,
 * the script file name, the start URLs, the thread count and the sleep time.
 *
 * <p>Accessors are generated by Lombok ({@code @Getter}/{@code @Setter}).
 * {@code language} has no generated setter; it is changed through
 * {@link #setLanguagefromArg(String)}.
 */
public class Params {

    /**
     * Accepted command-line spellings for each supported language.
     *
     * <p>Built once when the class is initialised and never modified afterwards.
     * Previously this static map was reassigned and repopulated by every
     * constructor call, which rewrote shared state on each instantiation.
     */
    private static final Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();

    static {
        alias.put(new Javascript(), WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
        alias.put(new JRuby(), WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
    }

    /** Script language; JavaScript unless changed by {@link #setLanguagefromArg(String)}. */
    @Getter
    Language language = new Javascript();

    /** Name of the script file to run. */
    @Getter @Setter
    String scriptFileName;

    /** URLs passed on the command line. */
    @Getter @Setter
    List<String> urls;

    /** Thread count; defaults to 1. */
    @Getter @Setter
    int thread = 1;

    /** Sleep time; defaults to 1000. */
    @Getter @Setter
    int sleepTime = 1000;

    /** Creates parameters holding the default values declared on the fields. */
    public Params() {
    }

    /**
     * Selects the language whose alias set contains {@code arg}.
     *
     * <p>If no alias set contains {@code arg}, the current language is left unchanged.
     *
     * @param arg the language name as typed on the command line
     */
    public void setLanguagefromArg(String arg) {
        for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
            if (languageSetEntry.getValue().contains(arg)) {
                this.language = languageSetEntry.getKey();
                return;
            }
        }
    }
}

@ -1,88 +1,21 @@
package us.codecraft.webmagic.scripts;
import org.apache.commons.cli.*;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.scripts.config.CommandLineOption;
import us.codecraft.webmagic.utils.WMCollections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* @author code4crafter@gmail.com
* @author code4crafter@gmail.com / FrancoisGib
* @since 0.4.1
*/
public class ScriptConsole {
private static class Params {
Language language = Language.JavaScript;
String scriptFileName;
List<String> urls;
int thread = 1;
int sleepTime = 1000;
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
static {
alias.put(Language.JavaScript, WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
alias.put(Language.JRuby, WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
}
public void setLanguagefromArg(String arg) {
for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
if (languageSetEntry.getValue().contains(arg)) {
this.language = languageSetEntry.getKey();
return;
}
}
}
private Language getLanguage() {
return language;
}
private void setLanguage(Language language) {
this.language = language;
}
private String getScriptFileName() {
return scriptFileName;
}
private void setScriptFileName(String scriptFileName) {
this.scriptFileName = scriptFileName;
}
private List<String> getUrls() {
return urls;
}
private void setUrls(List<String> urls) {
this.urls = urls;
}
private int getThread() {
return thread;
}
private void setThread(int thread) {
this.thread = thread;
}
private int getSleepTime() {
return sleepTime;
}
private void setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
}
}
public static void main(String[] args) {
Params params = parseCommand(args);
startSpider(params);
@ -140,45 +73,9 @@ public class ScriptConsole {
private static Params readOptions(CommandLine commandLine) {
Params params = new Params();
if (commandLine.hasOption("l")) {
String language = commandLine.getOptionValue("l");
params.setLanguagefromArg(language);
}
if (commandLine.hasOption("f")) {
String scriptFilename = commandLine.getOptionValue("f");
params.setScriptFileName(scriptFilename);
} else {
exit();
}
if (commandLine.hasOption("s")) {
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
params.setSleepTime(sleepTime);
}
if (commandLine.hasOption("t")) {
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
params.setThread(thread);
}
if (commandLine.hasOption("g")) {
configLogger(commandLine.getOptionValue("g"));
}
params.setUrls(commandLine.getArgList());
List<CommandLineOption> options = CommandLineOption.getAllOptions();
for (CommandLineOption option : options)
option.addParamOptionIfInCommandLine(params, commandLine);
return params;
}
private static void configLogger(String value) {
Logger rootLogger = Logger.getRootLogger();
if ("debug".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.DEBUG);
} else if ("info".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.INFO);
} else if ("warn".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.WARN);
} else if ("trace".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.TRACE);
} else if ("off".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.OFF);
} else if ("error".equalsIgnoreCase(value)) {
rootLogger.setLevel(Level.ERROR);
}
}
}
}

@ -2,6 +2,9 @@ package us.codecraft.webmagic.scripts;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import us.codecraft.webmagic.scripts.languages.Language;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
@ -11,14 +14,11 @@ import java.util.concurrent.atomic.AtomicInteger;
*/
public class ScriptEnginePool {
private final int size;
private final AtomicInteger availableCount;
private final LinkedBlockingQueue<ScriptEngine> scriptEngines = new LinkedBlockingQueue<ScriptEngine>();
public ScriptEnginePool(Language language,int size) {
this.size = size;
this.availableCount = new AtomicInteger(size);
for (int i=0;i<size;i++){
ScriptEngineManager manager = new ScriptEngineManager();

@ -4,17 +4,14 @@ package us.codecraft.webmagic.scripts;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import org.apache.commons.io.IOUtils;
import org.jruby.RubyHash;
import org.python.core.PyDictionary;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scripts.languages.Language;
/**
* @author code4crafter@gmail.com
@ -55,35 +52,7 @@ public class ScriptProcessor implements PageProcessor {
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
try {
switch (language) {
case JavaScript:
engine.eval(defines + "\n" + script, context);
// NativeObject o = (NativeObject) engine.get("result");
// if (o != null) {
// for (Object o1 : o.getIds()) {
// String key = String.valueOf(o1);
// page.getResultItems().put(key, NativeObject.getProperty(o, key));
// }
// }
break;
case JRuby:
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, context);
Iterator itruby = oRuby.entrySet().iterator();
while (itruby.hasNext()) {
Map.Entry pairs = (Map.Entry) itruby.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
break;
case Jython:
engine.eval(defines + "\n" + script, context);
PyDictionary oJython = (PyDictionary) engine.get("result");
Iterator it = oJython.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pairs = (Map.Entry) it.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
break;
}
this.language.process(engine, defines, script, page);
} catch (ScriptException e) {
e.printStackTrace();
}

@ -7,6 +7,9 @@ import java.io.InputStream;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.scripts.languages.Javascript;
import us.codecraft.webmagic.scripts.languages.Language;
/**
* @author code4crafter@gmail.com
@ -14,7 +17,7 @@ import org.apache.commons.io.IOUtils;
*/
public class ScriptProcessorBuilder {
private static final Language DefaultLanguage = Language.JavaScript;
private static final Language DefaultLanguage = new Javascript();
private Language language = DefaultLanguage;
@ -39,7 +42,6 @@ public class ScriptProcessorBuilder {
InputStream resourceAsStream = new FileInputStream(fileName);
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
} catch (IOException e) {
//wrap IOException because I prefer a runtime exception...
throw new IllegalArgumentException(e);
}
return this;
@ -50,7 +52,6 @@ public class ScriptProcessorBuilder {
InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset());
} catch (IOException e) {
//wrap IOException because I prefer a runtime exception...
throw new IllegalArgumentException(e);
}
return this;

@ -0,0 +1,82 @@
package us.codecraft.webmagic.scripts.config;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import lombok.Getter;
import us.codecraft.webmagic.scripts.Params;
/**
 * Base class for one single-character command-line option that knows how to
 * copy its value from a parsed {@link CommandLine} into a {@link Params}.
 */
public abstract class CommandLineOption {

    /** The option character this handler responds to. */
    @Getter
    char option;

    /**
     * @param option the option character this handler responds to
     */
    public CommandLineOption(char option) {
        this.option = option;
    }

    /**
     * Applies this option to {@code params}. Invoked by
     * {@link #addParamOptionIfInCommandLine(Params, CommandLine)} only when the
     * option is present on the command line.
     *
     * @param params      the parameters to update
     * @param commandLine the parsed command line
     */
    protected abstract void addParamOption(Params params, CommandLine commandLine);

    /**
     * Applies this option to {@code params} when the command line contains it;
     * does nothing otherwise.
     *
     * @param params      the parameters to update
     * @param commandLine the parsed command line
     */
    public void addParamOptionIfInCommandLine(Params params, CommandLine commandLine) {
        boolean present = commandLine.hasOption(this.option);
        if (!present) {
            return;
        }
        addParamOption(params, commandLine);
    }

    /**
     * @return one new handler per supported option, in the order l, f, s, t, g
     */
    public static List<CommandLineOption> getAllOptions() {
        return List.of(
                new OptionL(),
                new OptionF(),
                new OptionS(),
                new OptionT(),
                new OptionG());
    }
}
class OptionL extends CommandLineOption {
public OptionL() {
super('l');
}
protected void addParamOption(Params params, CommandLine commandLine) {
String language = commandLine.getOptionValue("l");
params.setLanguagefromArg(language);
}
}
class OptionF extends CommandLineOption {
public OptionF() {
super('f');
}
protected void addParamOption(Params params, CommandLine commandLine) {
String scriptFilename = commandLine.getOptionValue("f");
params.setScriptFileName(scriptFilename);
}
}
class OptionS extends CommandLineOption {
public OptionS() {
super('s');
}
protected void addParamOption(Params params, CommandLine commandLine) {
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
params.setSleepTime(sleepTime);
}
}
class OptionT extends CommandLineOption {
public OptionT() {
super('t');
}
protected void addParamOption(Params params, CommandLine commandLine) {
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
params.setThread(thread);
}
}
class OptionG extends CommandLineOption {
public OptionG() {
super('g');
}
protected void addParamOption(Params params, CommandLine commandLine) {
ConfigLogger.configLogger(commandLine.getOptionValue("g"));
}
}

@ -0,0 +1,34 @@
package us.codecraft.webmagic.scripts.config;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.Logger;
import org.slf4j.LoggerFactory;
public class ConfigLogger {
/**
* Log the config parameter. If the counter is less than the number of available
* options then it means that the user entered an option
*
* @param value The config string
*/
public static void configLogger(String value) {
List<Pair<String, Level>> options = List.of(
Pair.of("debug", Level.DEBUG),
Pair.of("info", Level.INFO),
Pair.of("warn", Level.WARN),
Pair.of("trace", Level.TRACE),
Pair.of("off", Level.OFF),
Pair.of("error", Level.ERROR));
Pair<String, Level> option = options.get(0);
int i = 1;
while (i < options.size() && !option.getLeft().equalsIgnoreCase(value))
option = options.get(i++);
if (i < options.size()) {
Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
rootLogger.setLevel(option.getRight());
}
}
}

@ -0,0 +1,26 @@
package us.codecraft.webmagic.scripts.languages;
import java.util.Iterator;
import java.util.Map;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import org.jruby.RubyHash;
import us.codecraft.webmagic.Page;
public class JRuby extends Language {
public JRuby() {
super("jruby","ruby/defines.rb","");
}
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext());
Iterator itruby = oRuby.entrySet().iterator();
while (itruby.hasNext()) {
Map.Entry pairs = (Map.Entry) itruby.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
}
}

@ -0,0 +1,16 @@
package us.codecraft.webmagic.scripts.languages;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import us.codecraft.webmagic.Page;
public class Javascript extends Language {
public Javascript() {
super("javascript","js/defines.js","");
}
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
engine.eval(defines + "\n" + script, engine.getContext());
}
}

@ -0,0 +1,27 @@
package us.codecraft.webmagic.scripts.languages;
import java.util.Iterator;
import java.util.Map;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import org.python.core.PyDictionary;
import us.codecraft.webmagic.Page;
public class Jython extends Language {
public Jython() {
super("jython","python/defines.py","");
}
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
engine.eval(defines + "\n" + script, engine.getContext());
PyDictionary oJython = (PyDictionary) engine.get("result");
Iterator it = oJython.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pairs = (Map.Entry) it.next();
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
}
}
}

@ -1,15 +1,18 @@
package us.codecraft.webmagic.scripts;
package us.codecraft.webmagic.scripts.languages;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import us.codecraft.webmagic.Page;
/**
* @author code4crafter@gmail.com
* @author FrancoisGib
*/
public enum Language {
JavaScript("javascript","js/defines.js",""),
JRuby("jruby","ruby/defines.rb",""),
Jython("jython","python/defines.py","");
public abstract class Language {
public Language(String engineName, String defineFile, String gatherFile) {
this.engineName = engineName;
this.defineFile = defineFile;
this.gatherFile = gatherFile;
}
private String engineName;
@ -17,12 +20,6 @@ public enum Language {
private String gatherFile;
Language(String engineName, String defineFile, String gatherFile) {
this.engineName = engineName;
this.defineFile = defineFile;
this.gatherFile = gatherFile;
}
public String getEngineName() {
return engineName;
}
@ -34,4 +31,6 @@ public enum Language {
public String getGatherFile() {
return gatherFile;
}
public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException;
}

@ -1,21 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- log4j 1.x configuration: one console appender; org.apache loggers at ERROR, everything else at INFO. -->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<!-- Console appender "stdout". -->
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<!-- Layout: timestamp, level padded to 5 characters, logger name, (source file:line), "##", message, newline. -->
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<!-- org.apache loggers: ERROR only; additivity is off, so events are not also handed to root. -->
<logger name="org.apache" additivity="false">
<level value="error" />
<appender-ref ref="stdout" />
</logger>
<!-- Every other logger: INFO and above. -->
<root>
<level value="info" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -2,7 +2,11 @@ package us.codecraft.webmagic.scripts;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scripts.languages.JRuby;
import us.codecraft.webmagic.scripts.languages.Javascript;
import us.codecraft.webmagic.scripts.languages.Jython;
/**
* @author code4crafter@gmail.com
@ -13,14 +17,14 @@ public class ScriptProcessorTest {
@Test
public void testJavaScriptProcessor() {
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build();
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}
@Test
public void testRubyProcessor() {
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build();
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}
@ -28,7 +32,7 @@ public class ScriptProcessorTest {
@Test
public void testPythonProcessor() {
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build();
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}

@ -1,21 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- log4j 1.x configuration: one console appender; org.apache loggers at WARN, everything else at DEBUG. -->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<!-- Console appender "stdout". -->
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<!-- Layout: timestamp, level padded to 5 characters, logger name, (source file:line), "##", message, newline. -->
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<!-- org.apache loggers: WARN and above; additivity is off, so events are not also handed to root. -->
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<!-- Every other logger: DEBUG and above. -->
<root>
<level value="debug" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Log4j 2 configuration: one console appender; org.apache loggers at WARN, everything else at DEBUG. -->
<Configuration>
<Appenders>
<!-- Console appender "stdout" writing to System.out. -->
<Console name="stdout" target="SYSTEM_OUT">
<!-- Layout: timestamp, level padded to 5 characters, logger name, (source file:line), "##", message, newline. -->
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</Console>
</Appenders>
<Loggers>
<!-- org.apache loggers: WARN and above; additivity is off, so events are not also handed to Root. -->
<Logger name="org.apache" level="warn" additivity="false">
<AppenderRef ref="stdout" />
</Logger>
<!-- Every other logger: DEBUG and above. -->
<Root level="debug">
<AppenderRef ref="stdout" />
</Root>
</Loggers>
</Configuration>

@ -1,45 +1,46 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.10.3</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-selenium</artifactId>
<artifactId>webmagic-selenium</artifactId>
<dependencies>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
<dependencies>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0-M1</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0-M1</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -1,15 +1,5 @@
package us.codecraft.webmagic.downloader.selenium;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
@ -22,6 +12,18 @@ import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
@ -58,7 +60,7 @@ class WebDriverPool {
* Configure the GhostDriver, and initialize a WebDriver instance. This part
* of code comes from GhostDriver.
* https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
*
*
* @author bob.li.0718@gmail.com
* @throws IOException
*/
@ -73,7 +75,6 @@ class WebDriverPool {
// Prepare capabilities
sCaps = new DesiredCapabilities();
sCaps.setJavascriptEnabled(true);
sCaps.setCapability("takesScreenshot", false);
String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
@ -134,9 +135,9 @@ class WebDriverPool {
sCaps.setBrowserName("phantomjs");
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
} else if (driver.equals(DRIVER_FIREFOX)) {
mDriver = new FirefoxDriver(sCaps);
mDriver = new FirefoxDriver(new FirefoxOptions(sCaps));
} else if (driver.equals(DRIVER_CHROME)) {
mDriver = new ChromeDriver(sCaps);
mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps));
} else if (driver.equals(DRIVER_PHANTOMJS)) {
mDriver = new PhantomJSDriver(sCaps);
}
@ -144,7 +145,7 @@ class WebDriverPool {
/**
* check whether input is a valid URL
*
*
* @author bob.li.0718@gmail.com
* @param urlString urlString
* @return true means yes, otherwise no.
@ -178,7 +179,7 @@ class WebDriverPool {
}
/**
*
*
* @return
* @throws InterruptedException
*/

@ -1,17 +1,18 @@
package us.codecraft.webmagic.downloader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.junit.Ignore;
import org.junit.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
@ -29,10 +30,10 @@ public class SeleniumTest {
Map<String, Object> preferences = new HashMap<String, Object>();
preferences.put("profile.default_content_settings", contentSettings);
DesiredCapabilities caps = DesiredCapabilities.chrome();
DesiredCapabilities caps = new DesiredCapabilities();
caps.setCapability("chrome.prefs", preferences);
caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome"));
WebDriver webDriver = new ChromeDriver(caps);
WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps));
webDriver.get("http://huaban.com/");
WebElement webElement = webDriver.findElement(By.xpath("/html"));
System.out.println(webElement.getAttribute("outerHTML"));

Loading…
Cancel
Save