Merge branch 'master' into master

pull/944/head
apaqi 5 years ago committed by GitHub
commit c4c48d3522
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,3 +1,3 @@
language: java
jdk:
- oraclejdk7
- openjdk9

@ -93,7 +93,7 @@ webmagic还包含两个可用的扩展包因为这两个包都依赖了比较
PageProcessor是webmagic-core的一部分定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码
```java
public class OschinaBlogPageProcesser implements PageProcessor {
public class OschinaBlogPageProcessor implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net");
@ -113,7 +113,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog")
Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog")
.addPipeline(new ConsolePipeline()).run();
}
}

@ -1,10 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>org.sonatype.oss</groupId>
<artifactId>oss-parent</artifactId>
<version>7</version>
</parent>
<groupId>us.codecraft</groupId>
<version>0.7.3</version>
<modelVersion>4.0.0</modelVersion>
@ -12,8 +7,8 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
<spring-version>4.0.0.RELEASE</spring-version>
</properties>
<artifactId>webmagic-parent</artifactId>
<name>webmagic-parent</name>
@ -39,7 +34,7 @@
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
<url>git@github.com:code4craft/webmagic.git</url>
<tag>webmagic-parent-0.6.1</tag>
</scm>
</scm>
<licenses>
<license>
<name>Apache License, Version 2.0</name>
@ -61,7 +56,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<version>4.13</version>
<scope>test</scope>
</dependency>
<dependency>
@ -73,12 +68,17 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
<version>4.5.12</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.13</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
<version>29.0-jre</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
@ -88,12 +88,12 @@
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.6</version>
<version>1.7.30</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.6</version>
<version>1.7.30</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
@ -103,12 +103,12 @@
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.28</version>
<version>1.2.68</version>
</dependency>
<dependency>
<groupId>com.github.dreamhead</groupId>
<artifactId>moco-core</artifactId>
<version>0.11.0</version>
<version>1.1.0</version>
<scope>test</scope>
<exclusions>
<exclusion>
@ -125,13 +125,13 @@
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>1.5.0</version>
<version>3.16.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.1</version>
<version>3.10</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
@ -139,9 +139,19 @@
<version>3.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>
<version>1.3.2</version>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.7</version>
</dependency>
<dependency>
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
<version>2.4.19</version>
</dependency>
<dependency>
<groupId>org.jruby</groupId>
<artifactId>jruby</artifactId>
<version>9.2.11.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
@ -149,20 +159,69 @@
<version>1.10.3</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>1.9.5</version>
<scope>test</scope>
<groupId>org.python</groupId>
<artifactId>jython</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>10.1</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.3</version>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>3.0.0-M3</version>
<executions>
<execution>
<id>enforce-maven</id>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireMavenVersion>
<version>3.0.5</version>
</requireMavenVersion>
</rules>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.18</version>
<version>3.0.0-M4</version>
<configuration>
<forkCount>0</forkCount>
</configuration>
@ -170,11 +229,10 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<version>3.8.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
<encoding>UTF-8</encoding>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<!--<plugin>-->
@ -200,14 +258,12 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.6</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
<version>3.1.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.2.0</version>
<configuration>
<excludes>
<exclude>log4j.xml</exclude>
@ -217,7 +273,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<version>3.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
@ -230,11 +286,15 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
<version>3.2.0</version>
<configuration>
<encoding>UTF-8</encoding>
<doctitle>WebMagic 0.7.3</doctitle>
<locale>en_US</locale>
<!-- avoid the issue: https://bugs.openjdk.java.net/browse/JDK-8212233 -->
<detectJavaApiLink>false</detectJavaApiLink>
</configuration>
<executions>
<execution>
@ -255,7 +315,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.4.1</version>
<version>3.0.0-M1</version>
</plugin>
</plugins>
</build>
@ -310,7 +370,7 @@
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6</version>
<version>1.6.8</version>
<extensions>true</extensions>
<configuration>
<serverId>sonatype-nexus-staging</serverId>

@ -48,6 +48,7 @@
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<optional>true</optional>
</dependency>
<dependency>
@ -66,7 +67,7 @@
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
@ -82,4 +83,4 @@
</dependencies>
</project>
</project>

@ -78,14 +78,15 @@ public class Request implements Serializable {
return this;
}
public Object getExtra(String key) {
@SuppressWarnings("unchecked")
public <T> T getExtra(String key) {
if (extras == null) {
return null;
}
return extras.get(key);
return (T) extras.get(key);
}
public Request putExtra(String key, Object value) {
public <T> Request putExtra(String key, T value) {
if (extras == null) {
extras = new HashMap<String, Object>();
}

@ -1,6 +1,5 @@
package us.codecraft.webmagic;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
@ -21,6 +20,7 @@ public class ResultItems {
private boolean skip;
@SuppressWarnings("unchecked")
public <T> T get(String key) {
Object o = fields.get(key);
if (o == null) {

@ -203,7 +203,7 @@ public class Site {
/**
* Set the interval between the processing of two pages.<br>
* Time unit is micro seconds.<br>
* Time unit is milliseconds.<br>
*
* @param sleepTime sleepTime
* @return this
@ -215,7 +215,7 @@ public class Site {
/**
* Get the interval between the processing of two pages.<br>
* Time unit is micro seconds.<br>
* Time unit is milliseconds.<br>
*
* @return the interval between the processing of two pages,
*/

@ -2,7 +2,6 @@ package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
@ -30,7 +29,6 @@ import java.util.Map;
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
@ThreadSafe
public class HttpClientDownloader extends AbstractDownloader {
private Logger logger = LoggerFactory.getLogger(getClass());

@ -1,5 +1,18 @@
package us.codecraft.webmagic.downloader;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.lang3.JavaVersion;
import org.apache.commons.lang3.SystemUtils;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
@ -9,34 +22,30 @@ import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.apache.http.ssl.SSLContexts;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
import us.codecraft.webmagic.Site;
/**
* @author code4crafter@gmail.com <br>
* @since 0.4.0
*/
public class HttpClientGenerator {
private transient Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator() {
@ -48,41 +57,51 @@ public class HttpClientGenerator {
connectionManager.setDefaultMaxPerRoute(100);
}
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
try {
return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书
} catch (KeyManagementException e) {
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
try {
SSLContext sslContext = createIgnoreVerifySSL();
String[] supportedProtocols;
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" };
} else {
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" };
}
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
null,
new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
}
return SSLConnectionSocketFactory.getSocketFactory();
return SSLConnectionSocketFactory.getSocketFactory();
}
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口用于绕过验证不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
SSLContext sc = SSLContext.getInstance("SSLv3");
sc.init(null, new TrustManager[] { trustManager }, null);
return sc;
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口用于绕过验证不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
SSLContext sc = SSLContext.getInstance("SSLv3");
sc.init(null, new TrustManager[] { trustManager }, null);
return sc;
}
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);
return this;
@ -94,7 +113,7 @@ public class HttpClientGenerator {
private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
httpClientBuilder.setConnectionManager(connectionManager);
if (site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());

@ -74,7 +74,7 @@ public class HttpUriRequestConverter {
}
if (proxy != null) {
requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
}
requestBuilder.setConfig(requestConfigBuilder.build());
HttpUriRequest httpUriRequest = requestBuilder.build();

@ -1,10 +1,8 @@
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.annotation.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
@ -24,6 +22,7 @@ import java.util.Map;
@ThreadSafe
public class FilePipeline extends FilePersistentBase implements Pipeline {
private Logger logger = LoggerFactory.getLogger(getClass());
/**

@ -1,73 +1,135 @@
package us.codecraft.webmagic.proxy;
/**
*
*/
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import org.apache.commons.lang3.StringUtils;
public class Proxy {
private String host;
private int port;
private String username;
private String password;
private String scheme;
private String host;
private int port;
private String username;
private String password;
public Proxy(String host, int port) {
this.host = host;
this.port = port;
}
/**
 * Builds a {@code Proxy} from a URI of the form
 * {@code [scheme:]//[username[:password]@]host:port}.
 *
 * <p>The user-info component is split on its <em>first</em> colon only, so a
 * password may itself contain colons. An empty username or password is stored
 * as {@code null}.
 *
 * @param uri the URI describing the proxy
 * @return a proxy carrying the URI's scheme, host, port and optional credentials
 */
public static Proxy create(final URI uri) {
    Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme());
    String userInfo = uri.getUserInfo();
    if (userInfo != null) {
        // String.split(":") drops trailing empty strings, so a user-info of ":"
        // yields an empty array and indexing it throws; it also truncates
        // passwords that contain ':'. Locate the first separator by hand instead.
        int separator = userInfo.indexOf(':');
        String user = separator < 0 ? userInfo : userInfo.substring(0, separator);
        String pass = separator < 0 ? "" : userInfo.substring(separator + 1);
        proxy.username = user.isEmpty() ? null : user;
        proxy.password = pass.isEmpty() ? null : pass;
    }
    return proxy;
}
public Proxy(String host, int port, String username, String password) {
this.host = host;
this.port = port;
this.username = username;
this.password = password;
}
public Proxy(String host, int port) {
this(host, port, null);
}
public Proxy(String host, int port, String scheme) {
this.host = host;
this.port = port;
this.scheme = scheme;
}
public Proxy(String host, int port, String username, String password) {
this.host = host;
this.port = port;
this.username = username;
this.password = password;
}
public String getScheme() {
return scheme;
}
public void setScheme(String scheme) {
this.scheme = scheme;
}
public String getHost() {
return host;
}
public int getPort() {
return port;
}
public String getUsername() {
return username;
}
public String getPassword() {
return password;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Proxy proxy = (Proxy) o;
if (port != proxy.port) return false;
if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false;
if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false;
return password != null ? password.equals(proxy.password) : proxy.password == null;
}
@Override
public int hashCode() {
int result = host != null ? host.hashCode() : 0;
result = 31 * result + port;
result = 31 * result + (username != null ? username.hashCode() : 0);
result = 31 * result + (password != null ? password.hashCode() : 0);
return result;
}
@Override
public String toString() {
return "Proxy{" +
"host='" + host + '\'' +
", port=" + port +
", username='" + username + '\'' +
", password='" + password + '\'' +
'}';
}
return host;
}
public int getPort() {
return port;
}
public String getUsername() {
return username;
}
public String getPassword() {
return password;
}
/**
 * Renders this proxy as a URI built from its scheme, credentials, host and port.
 *
 * <p>The user-info part is {@code username[:password]}, each piece passed through
 * {@code urlencode}; it is omitted entirely when neither value is set.
 *
 * @return the URI form of this proxy
 * @throws IllegalArgumentException if the components do not form a valid URI
 */
public URI toURI() {
    String credentials = "";
    if (username != null) {
        credentials += urlencode(username);
    }
    if (password != null) {
        credentials += ":" + urlencode(password);
    }
    // An empty user-info must be passed as null so that no "@" is emitted.
    final String userInfo = credentials.isEmpty() ? null : credentials;
    // NOTE(review): the multi-argument URI constructor quotes '%' itself, so values
    // already percent-encoded by urlencode may come out double-encoded - confirm.
    try {
        return new URI(scheme, userInfo, host, port, null, null, null);
    } catch (URISyntaxException e) {
        throw new IllegalArgumentException(e.getMessage(), e);
    }
}
/**
 * URL-encodes the given text using UTF-8.
 *
 * @param s the text to encode
 * @return the encoded text
 * @throws IllegalArgumentException if the UTF-8 encoding is reported as unsupported
 */
private String urlencode(String s) {
    try {
        return URLEncoder.encode(s, StandardCharsets.UTF_8.name());
    } catch (UnsupportedEncodingException unsupported) {
        throw new IllegalArgumentException(unsupported);
    }
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Proxy proxy = (Proxy) o;
if (port != proxy.port) return false;
if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false;
if (scheme != null ? !scheme.equals(proxy.scheme) : proxy.scheme != null) return false;
if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false;
return password != null ? password.equals(proxy.password) : proxy.password == null;
}
@Override
public int hashCode() {
int result = host != null ? host.hashCode() : 0;
result = 31 * result + port;
result = 31 * result + (scheme != null ? scheme.hashCode() : 0);
result = 31 * result + (username != null ? username.hashCode() : 0);
result = 31 * result + (password != null ? password.hashCode() : 0);
return result;
}
@Override
public String toString() {
return this.toURI().toString();
}
}

@ -25,5 +25,5 @@ public interface ProxyProvider {
* @return proxy
*/
Proxy getProxy(Task task);
}

@ -59,4 +59,5 @@ public class SimpleProxyProvider implements ProxyProvider {
}
return p % size;
}
}

@ -1,6 +1,5 @@
package us.codecraft.webmagic.scheduler;
import org.apache.http.annotation.ThreadSafe;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.NumberUtils;
@ -16,7 +15,6 @@ import java.util.concurrent.PriorityBlockingQueue;
* @author code4crafter@gmail.com <br>
* @since 0.2.1
*/
@ThreadSafe
public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
public static final int INITIAL_CAPACITY = 5;

@ -1,6 +1,5 @@
package us.codecraft.webmagic.scheduler;
import org.apache.http.annotation.ThreadSafe;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
@ -15,7 +14,6 @@ import java.util.concurrent.LinkedBlockingQueue;
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
@ThreadSafe
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>();

@ -41,7 +41,7 @@ public class RegexSelector implements Selector {
/**
* Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1.
* @param regexStr
* @param regexStr the regular expression.
*/
public RegexSelector(String regexStr) {
this.compileRegex(regexStr);

@ -0,0 +1,26 @@
package us.codecraft.webmagic.downloader;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Checks that {@link HttpClientDownloader} can download a page from an HTTPS URL.
 * The test requests {@code https://juejin.im/}, so it presumably needs network
 * access to that site when it runs.
 *
 * @author code4crafter@gmail.com
 * Date: 2017/11/29
 * Time: 1:32
 */
public class SSLCompatibilityTest {

    /** Downloads an HTTPS page and expects the download to be reported as successful. */
    @Test
    public void test_tls12() throws Exception {
        HttpClientDownloader downloader = new HttpClientDownloader();
        Task crawlTask = Site.me().setCycleRetryTimes(5).toTask();
        Page downloaded = downloader.download(new Request("https://juejin.im/"), crawlTask);
        assertThat(downloaded.isDownloadSuccess()).isTrue();
    }
}

@ -1,45 +1,97 @@
package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost;
import org.junit.BeforeClass;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpHost;
import org.junit.BeforeClass;
import org.junit.Test;
/**
* @author yxssfxwzy@sina.com May 30, 2014
*
*/
public class ProxyTest {
private static List<String[]> httpProxyList = new ArrayList<String[]>();
@BeforeClass
public static void before() {
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
// "0.0.0.4:0" };
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
for (String line : source) {
httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] });
}
}
class Fetch extends Thread {
HttpHost hp;
public Fetch(HttpHost hp) {
this.hp = hp;
}
@Override
public void run() {
try {
System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
sleep(500);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
private static List<String[]> httpProxyList = new ArrayList<String[]>();
@BeforeClass
public static void before() {
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
// "0.0.0.4:0" };
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
for (String line : source) {
httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] });
}
}
class Fetch extends Thread {
HttpHost hp;
public Fetch(HttpHost hp) {
this.hp = hp;
}
@Override
public void run() {
try {
System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
sleep(500);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
/**
 * Verifies that {@code Proxy.create} extracts scheme, username, password, host and
 * port from URIs with and without a scheme and with full, partial or no credentials.
 */
@Test
public void testCreate() {
    // no scheme, no credentials
    assertLocalProxy(Proxy.create(URI.create("//127.0.0.1:8080")), null, null, null);
    // scheme only
    assertLocalProxy(Proxy.create(URI.create("http://127.0.0.1:8080")), "http", null, null);
    // username and password
    assertLocalProxy(Proxy.create(URI.create("//username:password@127.0.0.1:8080")), null, "username", "password");
    // username only
    assertLocalProxy(Proxy.create(URI.create("//username@127.0.0.1:8080")), null, "username", null);
    // password only
    assertLocalProxy(Proxy.create(URI.create("//:password@127.0.0.1:8080")), null, null, "password");
}

/**
 * Asserts the scheme and credentials of a proxy that is expected to point at
 * {@code 127.0.0.1:8080}; a {@code null} expectation means the value must be absent.
 */
private static void assertLocalProxy(Proxy actual, String scheme, String username, String password) {
    assertEquals(scheme, actual.getScheme());
    assertEquals(username, actual.getUsername());
    assertEquals(password, actual.getPassword());
    assertEquals("127.0.0.1", actual.getHost());
    assertEquals(8080, actual.getPort());
}
/**
 * Verifies the URI-style text produced by {@code Proxy.toString()} for proxies
 * built with and without a scheme and with full, partial or no credentials.
 */
@Test
public void testToString() {
    Proxy hostAndPort = new Proxy("127.0.0.1", 8080);
    assertEquals("//127.0.0.1:8080", hostAndPort.toString());

    Proxy withScheme = new Proxy("127.0.0.1", 8080, "http");
    assertEquals("http://127.0.0.1:8080", withScheme.toString());

    Proxy withCredentials = new Proxy("127.0.0.1", 8080, "username", "password");
    assertEquals("//username:password@127.0.0.1:8080", withCredentials.toString());

    Proxy usernameOnly = new Proxy("127.0.0.1", 8080, "username", null);
    assertEquals("//username@127.0.0.1:8080", usernameOnly.toString());

    Proxy passwordOnly = new Proxy("127.0.0.1", 8080, null, "password");
    assertEquals("//:password@127.0.0.1:8080", passwordOnly.toString());
}
}

@ -0,0 +1,16 @@
package us.codecraft.webmagic.utils;
import org.junit.Assert;
import org.junit.Test;
/** Tests for {@code NumberUtils}. */
public class NumberUtilsTest {

    /** Checks the sign returned by {@code compareLong} for equal, greater and smaller operands. */
    @Test
    public void testCompareLong() {
        // Each row is { left, right, expected result }.
        long[][] cases = {
            { 0L, 0L, 0L },
            { 9L, 0L, 1L },
            { 0L, 9L, -1L },
            { -9L, 0L, -1L },
            { 0L, -9L, 1L },
        };
        for (long[] row : cases) {
            Assert.assertEquals(row[2], NumberUtils.compareLong(row[0], row[1]));
        }
    }
}

@ -13,16 +13,14 @@
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
@ -32,4 +30,4 @@
</dependency>
</dependencies>
</project>
</project>

@ -1,6 +1,5 @@
package us.codecraft.webmagic.downloader;
import org.apache.http.annotation.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
@ -16,7 +15,6 @@ import java.io.*;
* @author dolphineor@gmail.com
* @version 0.5.3
*/
@ThreadSafe
public class PhantomJSDownloader extends AbstractDownloader {
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);

@ -1,6 +1,8 @@
package us.codecraft.webmagic.example;
import org.apache.log4j.Logger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.handler.CompositePageProcessor;
import us.codecraft.webmagic.handler.CompositePipeline;
@ -15,7 +17,7 @@ import us.codecraft.webmagic.handler.RequestMatcher;
*/
public class PatternProcessorExample {
private static Logger log = Logger.getLogger(PatternProcessorExample.class);
private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class);
public static void main(String... args) {

@ -1,12 +1,13 @@
package us.codecraft.webmagic.scheduler;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import java.io.*;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
@ -17,6 +18,13 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
/**
* Store urls and cursor in files so that a Spider can resume the status when shutdown.<br>
@ -141,7 +149,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
urls.add(line.trim());
lineReaded++;
if (lineReaded > cursor.get()) {
queue.add(new Request(line));
queue.add(deserializeRequest(line));
}
}
} finally {
@ -183,7 +191,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
init(task);
}
queue.add(request);
fileUrlWriter.println(request.getUrl());
fileUrlWriter.println(serializeRequest(request));
}
@Override
@ -204,4 +212,13 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
public int getTotalRequestsCount(Task task) {
return getDuplicateRemover().getTotalRequestsCount(task);
}
/**
 * Converts a request into the text line written to the URL cache file.
 * This implementation keeps only the request URL; the method is protected so a
 * subclass can supply a different format.
 *
 * @param request the request being queued
 * @return the line to persist for this request
 */
protected String serializeRequest(Request request) {
    final String url = request.getUrl();
    return url;
}
/**
 * Rebuilds a request from a line read back from the URL cache file.
 * This implementation treats the whole line as the request URL.
 *
 * @param line a line previously stored in the cache file
 * @return the request for that line
 */
protected Request deserializeRequest(String line) {
    final Request restored = new Request(line);
    return restored;
}
}

@ -2,6 +2,7 @@ package us.codecraft.webmagic.scheduler;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
@ -60,14 +61,41 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
Jedis jedis = pool.getResource();
try {
jedis.rpush(getQueueKey(task), request.getUrl());
if (request.getExtras() != null) {
if (checkForAdditionalInfo(request)) {
String field = DigestUtils.shaHex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
}
} finally {
pool.returnResource(jedis);
jedis.close();
}
}
/**
 * Tells whether a request carries anything beyond its URL: headers, cookies, a
 * charset, a method, binary content, a request body, extras or a non-zero priority.
 *
 * @param request the request to inspect; may be {@code null}
 * @return {@code true} if any such additional information is present,
 *         {@code false} otherwise or when {@code request} is {@code null}
 */
private boolean checkForAdditionalInfo(Request request) {
    if (request == null) {
        return false;
    }
    // Checks are evaluated left to right and stop at the first one that holds.
    return !request.getHeaders().isEmpty()
            || !request.getCookies().isEmpty()
            || StringUtils.isNotBlank(request.getCharset())
            || StringUtils.isNotBlank(request.getMethod())
            || request.isBinaryContent()
            || request.getRequestBody() != null
            || (request.getExtras() != null && !request.getExtras().isEmpty())
            || request.getPriority() != 0L;
}
@Override
@ -85,7 +113,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
Request o = JSON.parseObject(new String(bytes), Request.class);
return o;
}
Request request = new Request(url);
Request request = new Request(url);
return request;
} finally {
pool.returnResource(jedis);
@ -100,8 +128,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
return QUEUE_PREFIX + task.getUUID();
}
protected String getItemKey(Task task)
{
protected String getItemKey(Task task) {
return ITEM_PREFIX + task.getUUID();
}

@ -11,12 +11,12 @@
<dependencies>
<dependency>
<groupId>us.codecraft</groupId>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>

@ -11,19 +11,17 @@
<dependencies>
<dependency>
<groupId>us.codecraft</groupId>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.5.1-1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
@ -34,7 +32,9 @@
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0-M1</version>
<configuration>
<skip>true</skip>
</configuration>

@ -2,11 +2,12 @@ package us.codecraft.webmagic.selector;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
import org.apache.log4j.Logger;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@ -40,7 +41,7 @@ public class Xpath2Selector implements Selector {
private XPathExpression xPathExpression;
private Logger logger = Logger.getLogger(getClass());
private Logger logger = LoggerFactory.getLogger(getClass());
public Xpath2Selector(String xpathStr) {
this.xpathStr = xpathStr;

@ -7,7 +7,6 @@
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-scripts</artifactId>
<properties>
<kotlin.version>1.1.2-2</kotlin.version>
@ -17,27 +16,23 @@
<dependency>
<groupId>org.jruby</groupId>
<artifactId>jruby</artifactId>
<version>1.7.6</version>
</dependency>
<dependency>
<groupId>org.jetbrains.kotlin</groupId>
<artifactId>kotlin-stdlib</artifactId>
<version>${kotlin.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
<version>2.1.6</version>
</dependency>
<dependency><groupId>org.python</groupId>
<dependency>
<groupId>org.python</groupId>
<artifactId>jython</artifactId>
<version>2.5.3</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>junit</groupId>
@ -45,12 +40,16 @@
<scope>test</scope>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
@ -59,21 +58,6 @@
<build>
<sourceDirectory>${project.basedir}/src/main/java</sourceDirectory>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.6</source>
<target>1.6</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>

@ -13,21 +13,16 @@
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.41.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<groupId>${project.groupId}</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
@ -37,7 +32,9 @@
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0-M1</version>
<configuration>
<skip>true</skip>
</configuration>

@ -1,10 +1,12 @@
package us.codecraft.webmagic.downloader.selenium;
import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
@ -29,7 +31,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
private volatile WebDriverPool webDriverPool;
private Logger logger = Logger.getLogger(getClass());
private Logger logger = LoggerFactory.getLogger(getClass());
private int sleepTime = 0;

@ -1,6 +1,5 @@
package us.codecraft.webmagic.downloader.selenium;
import org.apache.log4j.Logger;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
@ -8,6 +7,8 @@ import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
@ -27,7 +28,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* Time: 1:41 <br>
*/
class WebDriverPool {
private Logger logger = Logger.getLogger(getClass());
private Logger logger = LoggerFactory.getLogger(getClass());
private final static int DEFAULT_CAPACITY = 5;

Loading…
Cancel
Save