Merge branch 'master' of github.com:code4craft/webmagic

Conflicts:
	README.md
	webmagic-samples/pom.xml
	webmagic-selenium/pom.xml
pull/358/head
yihua.huang 11 years ago
commit 7c41bec92f

@ -38,12 +38,12 @@ webmagic使用maven管理依赖在项目中添加对应的依赖即可使用w
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.4.1</version>
<version>0.4.2</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.4.1</version>
<version>0.4.2</version>
</dependency>
#### 项目结构

@ -28,12 +28,12 @@ Add dependencies to your project:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.4.0</version>
<version>0.4.2</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.4.0</version>
<version>0.4.2</version>
</dependency>
## Get Started:

@ -6,7 +6,7 @@
<version>7</version>
</parent>
<groupId>us.codecraft</groupId>
<version>0.4.2-SNAPSHOT</version>
<version>0.4.3-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>

@ -27,12 +27,12 @@ webmagic使用maven管理依赖在项目中添加对应的依赖即可使用w
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.4.0</version>
<version>0.4.2</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.4.0</version>
<version>0.4.2</version>
</dependency>
#### 项目结构

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.4.2-SNAPSHOT</version>
<version>0.4.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -70,6 +70,7 @@ public class HttpClientDownloader implements Downloader {
CloseableHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) {
synchronized (this) {
httpClient = httpClients.get(domain);
if (httpClient == null) {
httpClient = httpClientGenerator.getClient(site);
httpClients.put(domain, httpClient);
@ -104,6 +105,7 @@ public class HttpClientDownloader implements Downloader {
}
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
.setConnectionRequestTimeout(site.getTimeOut())
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
if (site != null && site.getHttpProxy() != null) {

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.4.2-SNAPSHOT</version>
<version>0.4.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -23,14 +23,18 @@ public class AppStore {
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..userRatingCount")
private int userRatingCount;
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls",multi = true)
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls")
private List<String> screenshotUrls;
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..supportedDevices")
private List<String> supportedDevices;
/**
 * Demo entry point: fetches a single iTunes lookup response, maps it onto an
 * {@code AppStore} instance through {@code OOSpider}, and prints the extracted fields
 * to standard output.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    final String lookupUrl = "http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software";

    // Build the spider for this model class and extract one object from the lookup URL.
    final AppStore extracted = OOSpider
            .create(Site.me(), AppStore.class)
            .<AppStore>get(lookupUrl);

    // Dump each extracted field, one per line, in declaration order.
    System.out.println(extracted.trackName);
    System.out.println(extracted.description);
    System.out.println(extracted.userRatingCount);
    System.out.println(extracted.screenshotUrls);
    System.out.println(extracted.supportedDevices);
}
}

@ -131,7 +131,9 @@ class PageModelExtractor {
if (regexPattern.trim().equals("")) {
regexPattern = ".*";
}
fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
fieldExtractor = new FieldExtractor(field,
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
@ -157,7 +159,7 @@ class PageModelExtractor {
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
}
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
comboExtract.notNull(), comboExtract.multi());
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
@ -172,7 +174,7 @@ class PageModelExtractor {
if (extractBy != null) {
Selector selector = ExtractorUtils.getSelector(extractBy);
fieldExtractor = new FieldExtractor(field, selector, extractBy.source() == ExtractBy.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
extractBy.notNull(), extractBy.multi());
extractBy.notNull(), extractBy.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
@ -359,7 +361,7 @@ class PageModelExtractor {
}
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
if (value==null){
if (value == null) {
return;
}
if (fieldExtractor.getSetterMethod() != null) {

@ -75,6 +75,8 @@ public @interface ComboExtract {
* Defines whether the extractor returns more than one result.
* When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
*
* Deprecated since 0.4.2. This option is now determined automatically from the declared type of the field.
* @deprecated since 0.4.2
* @return whether the extractor returns more than one result
*/
boolean multi() default false;

@ -67,6 +67,8 @@ public @interface ExtractBy {
* Defines whether the extractor returns more than one result.
* When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
*
* Deprecated since 0.4.2. This option is now determined automatically from the declared type of the field.
* @deprecated since 0.4.2
* @return whether the extractor returns more than one result
*/
boolean multi() default false;

@ -33,6 +33,8 @@ public @interface ExtractByUrl {
* Defines whether the extractor returns more than one result.
* When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
*
* Deprecated since 0.4.2. This option is now determined automatically from the declared type of the field.
* @deprecated since 0.4.2
* @return whether the extractor returns more than one result
*/
boolean multi() default false;

@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.0</version>
<version>0.4.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.1</version>
<version>0.4.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.0</version>
<version>0.4.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.2-SNAPSHOT</version>
<version>0.4.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -31,6 +31,11 @@
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>

@ -92,7 +92,8 @@ public class ScriptConsole {
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
.language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
pageProcessor.getSite().setSleepTime(params.getSleepTime());
pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404, 500));
pageProcessor.getSite().setRetryTimes(3);
pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404,403, 500,502));
Spider spider = Spider.create(pageProcessor).thread(params.getThread());
spider.clearPipeline().addPipeline(new Pipeline() {
@Override

@ -34,6 +34,7 @@ public class ScriptEnginePool {
/**
 * Hands a script engine back to this pool.
 *
 * Adds the engine to {@code scriptEngines} and then increments {@code availableCount}.
 * NOTE(review): presumably the counterpart of an acquire-style method that is not
 * visible in this hunk — confirm against the rest of the class.
 *
 * @param scriptEngine the engine to put back into {@code scriptEngines}
 */
public void release(ScriptEngine scriptEngine){
// Store the engine first, then bump the counter of available engines.
scriptEngines.add(scriptEngine);
availableCount.incrementAndGet();
}
}

@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.1</version>
<version>0.4.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -34,12 +34,12 @@ webmagic使用maven管理依赖在项目中添加对应的依赖即可使用w
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.4.0</version>
<version>0.4.2</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.4.0</version>
<version>0.4.2</version>
</dependency>
#### 项目结构

Loading…
Cancel
Save