diff --git a/README.md b/README.md index 967b9f67..341e0961 100644 --- a/README.md +++ b/README.md @@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.4.1 + 0.4.2 us.codecraft webmagic-extension - 0.4.1 + 0.4.2 #### 项目结构 diff --git a/en_docs/README.md b/en_docs/README.md index 82b82a81..684da90d 100644 --- a/en_docs/README.md +++ b/en_docs/README.md @@ -28,12 +28,12 @@ Add dependencies to your project: us.codecraft webmagic-core - 0.4.0 + 0.4.2 us.codecraft webmagic-extension - 0.4.0 + 0.4.2 ## Get Started: diff --git a/pom.xml b/pom.xml index a302728e..3d488ae5 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.4.2-SNAPSHOT + 0.4.3-SNAPSHOT 4.0.0 pom diff --git a/user-manual.md b/user-manual.md index acb955ee..f225c8a7 100644 --- a/user-manual.md +++ b/user-manual.md @@ -27,12 +27,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.4.0 + 0.4.2 us.codecraft webmagic-extension - 0.4.0 + 0.4.2 #### 项目结构 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index c419ea70..914bfdab 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.4.2-SNAPSHOT + 0.4.3-SNAPSHOT 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index b6baaa7e..da34c2de 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -70,6 +70,7 @@ public class HttpClientDownloader implements Downloader { CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { + httpClient = httpClients.get(domain); if (httpClient == null) { httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); @@ -104,6 +105,7 @@ public class 
HttpClientDownloader implements Downloader { } RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() .setConnectionRequestTimeout(site.getTimeOut()) + .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) .setCookieSpec(CookieSpecs.BEST_MATCH); if (site != null && site.getHttpProxy() != null) { diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 51b3924e..c6af14f6 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.4.2-SNAPSHOT + 0.4.3-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java index fcc937b5..d0d056f8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java @@ -23,14 +23,18 @@ public class AppStore { @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..userRatingCount") private int userRatingCount; - @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls",multi = true) + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls") private List screenshotUrls; + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..supportedDevices") + private List supportedDevices; + public static void main(String[] args) { AppStore appStore = OOSpider.create(Site.me(), AppStore.class).get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software"); System.out.println(appStore.trackName); System.out.println(appStore.description); System.out.println(appStore.userRatingCount); System.out.println(appStore.screenshotUrls); + System.out.println(appStore.supportedDevices); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 
d7da0c9d..62b6de08 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -131,7 +131,9 @@ class PageModelExtractor { if (regexPattern.trim().equals("")) { regexPattern = ".*"; } - fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); + fieldExtractor = new FieldExtractor(field, + new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), + extractByUrl.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -157,7 +159,7 @@ class PageModelExtractor { selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, - comboExtract.notNull(), comboExtract.multi()); + comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -172,7 +174,7 @@ class PageModelExtractor { if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); fieldExtractor = new FieldExtractor(field, selector, extractBy.source() == ExtractBy.Source.RawHtml ? 
FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, - extractBy.notNull(), extractBy.multi()); + extractBy.notNull(), extractBy.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -359,7 +361,7 @@ class PageModelExtractor { } private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { - if (value==null){ + if (value == null) { return; } if (fieldExtractor.getSetterMethod() != null) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java index 5268a254..6d2ce6cd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java @@ -75,6 +75,8 @@ public @interface ComboExtract { * Define whether the extractor return more than one result. * When set to 'true', the extractor return a list of string (so you should define the field as List).
* + * Deprecated since 0.4.2. Multi-result extraction is now enabled automatically when the field is declared as a List. + * @deprecated since 0.4.2 * @return whether the extractor return more than one result */ boolean multi() default false; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 8fddccf8..2e23aa00 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -67,6 +67,8 @@ public @interface ExtractBy { * Define whether the extractor return more than one result. * When set to 'true', the extractor return a list of string (so you should define the field as List).
* + * Deprecated since 0.4.2. Multi-result extraction is now enabled automatically when the field is declared as a List. + * @deprecated since 0.4.2 * @return whether the extractor return more than one result */ boolean multi() default false; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java index 328c0795..6c778629 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java @@ -33,6 +33,8 @@ public @interface ExtractByUrl { * Define whether the extractor return more than one result. * When set to 'true', the extractor return a list of string (so you should define the field as List).
* + * Deprecated since 0.4.2. Multi-result extraction is now enabled automatically when the field is declared as a List. + * @deprecated since 0.4.2 * @return whether the extractor return more than one result */ boolean multi() default false; diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml index 3dcf2b62..223942a9 100644 --- a/webmagic-lucene/pom.xml +++ b/webmagic-lucene/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.4.0 + 0.4.2 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 8c6b87e2..7b86ba2f 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.4.1 + 0.4.2 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index a8841df7..225d1555 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.4.0 + 0.4.2 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 29fe1f79..1c65513d 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.4.2-SNAPSHOT + 0.4.3-SNAPSHOT 4.0.0 @@ -31,6 +31,11 @@ webmagic-core ${project.version} + + us.codecraft + webmagic-extension + ${project.version} + diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 7d3b6365..57a923ef 100644 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -92,7 +92,8 @@ public class ScriptConsole { ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom() .language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build(); pageProcessor.getSite().setSleepTime(params.getSleepTime()); - pageProcessor.getSite().setAcceptStatCode(Sets.newHashSet(200, 404, 500)); + 
pageProcessor.getSite().setRetryTimes(3); + pageProcessor.getSite().setAcceptStatCode(Sets.newHashSet(200, 404,403, 500,502)); Spider spider = Spider.create(pageProcessor).thread(params.getThread()); spider.clearPipeline().addPipeline(new Pipeline() { @Override diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java index 9dc74133..d1e5d7fe 100644 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java @@ -34,6 +34,7 @@ public class ScriptEnginePool { public void release(ScriptEngine scriptEngine){ scriptEngines.add(scriptEngine); + availableCount.incrementAndGet(); } } diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 842d5e16..3026a369 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.4.1 + 0.4.2 4.0.0 diff --git a/zh_docs/README.md b/zh_docs/README.md index e6961d8e..c58469a9 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -34,12 +34,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.4.0 + 0.4.2 us.codecraft webmagic-extension - 0.4.0 + 0.4.2 #### 项目结构