diff --git a/.gitignore b/.gitignore index 0175dbaa..3a839a5f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,77 @@ -target -*.iml -out/ -.idea -.classpath +target/ +pom.xml.tag +pom.xml.releaseBackup +pom.xml.versionsBackup +pom.xml.next +release.properties +dependency-reduced-pom.xml +buildNumber.properties +.mvn/timing.properties +# https://github.com/takari/maven-wrapper#usage-without-binary-jar +.mvn/wrapper/maven-wrapper.jar + +# Eclipse m2e generated files +# Eclipse Core .project -.settings/ +# JDT-specific (Eclipse Java Development Tools) +.classpath +.metadata bin/ -.myeclipse +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ +.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Uncomment this line if you wish to ignore the project description file. +# Typically, this file would be tracked if it contains build/dependency configurations: +#.project diff --git a/pom.xml b/pom.xml index df314b7c..b96c9a82 100644 --- a/pom.xml +++ b/pom.xml @@ -1,14 +1,24 @@ - - us.codecraft - 0.10.3 + 4.0.0 + + org.oxerr + oxerr-parent + 2.2.1 + + us.codecraft + 1.0.0 pom UTF-8 UTF-8 - 1.8 - 1.8 + 11 + 11 3.23.1 1.5.0 4.4 @@ -23,20 +33,21 @@ 3.7.1 9.3.9.0 2.9.0 - 4.13.2 + 5.10.2 + 1.10.2 2.7.3 - 1.2.17 + 2.23.1 2.0.2-beta 1.3.0 1.2.0 - 11.4 - 3.141.59 + 12.4 + 4.14.1 2.0.4 4.0.0.RELEASE 0.3.5 - webmagic-parent - webmagic-parent + webmagic + webmagic A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler. @@ -77,14 +88,41 @@ webmagic-coverage + + + org.apache.logging.log4j + log4j-core + test + + + org.apache.logging.log4j + log4j-slf4j2-impl + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.platform + junit-platform-launcher + test + + + org.junit.platform + junit-platform-runner + test + + + - - junit - junit - ${junit.version} - test - org.mockito mockito-all @@ -101,6 +139,16 @@ httpcore ${httpcore.version} + + org.apache.logging.log4j + log4j-core + ${log4j2.version} + + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j2.version} + com.google.guava guava @@ -112,13 +160,28 @@ ${json-path.version} - org.slf4j - slf4j-api - ${slf4j.version} + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + + + org.junit.vintage + junit-vintage-engine + ${junit.version} + + + org.junit.platform + junit-platform-launcher + ${junit.platform.version} + + + org.junit.platform + junit-platform-runner + ${junit.platform.version} org.slf4j - slf4j-log4j12 + slf4j-api ${slf4j.version} @@ -143,11 +206,6 @@ - - log4j - log4j - ${log4j.version} - org.assertj assertj-core @@ -219,86 +277,10 @@ - - org.apache.maven.plugins - maven-enforcer-plugin - 3.1.0 - - - enforce-maven - - enforce - - - - - 3.5.0 - - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - - - - - - - - - - - - - - - - - - - - - - org.apache.maven.plugins - maven-resources-plugin - - - org.apache.maven.plugins - maven-jar-plugin - - - log4j.xml - - - - - org.apache.maven.plugins - maven-source-plugin - 3.2.1 - - - attach-sources - - jar - - - - org.apache.maven.plugins maven-javadoc-plugin - 3.4.1 - UTF-8 WebMagic ${project.version} en_US @@ -322,11 +304,6 @@ - - org.apache.maven.plugins - maven-release-plugin - 3.0.0-M6 - org.jacoco jacoco-maven-plugin @@ -355,189 +332,6 @@ - - - - org.apache.maven.plugins - maven-clean-plugin - 3.2.0 - - - org.apache.maven.plugins - maven-compiler-plugin - 3.10.1 - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0 - - - org.apache.maven.plugins - maven-install-plugin - 3.0.1 - - - org.apache.maven.plugins - maven-jar-plugin - 3.3.0 - - - org.apache.maven.plugins - maven-jxr-plugin - 3.3.0 - - - org.apache.maven.plugins - maven-pmd-plugin - 3.19.0 - - - org.apache.maven.plugins - maven-resources-plugin - 3.3.0 - - - org.apache.maven.plugins - maven-site-plugin - 4.0.0-M3 - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M7 - - - org.apache.maven.plugins - maven-surefire-report-plugin - 3.0.0-M7 - - - org.codehaus.mojo - taglist-maven-plugin - 3.0.0 - - - org.jacoco - jacoco-maven-plugin - 0.8.8 - - - com.amashchenko.maven.plugin - gitflow-maven-plugin - 1.18.0 - - - com.github.spotbugs - spotbugs-maven-plugin - 4.7.2.0 - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - none - - - - org.apache.maven.plugins - maven-jxr-plugin - - - org.apache.maven.plugins - maven-pmd-plugin - - - org.apache.maven.plugins - maven-surefire-report-plugin - - - org.codehaus.mojo - taglist-maven-plugin - - - com.github.spotbugs - spotbugs-maven-plugin - - - - - - - release - - - - - org.apache.maven.plugins - maven-source-plugin - 3.2.1 - - - package - - jar-no-fork - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.4.1 - - - package - - jar - - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 3.0.1 - - - verify - - sign - - - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.13 - true - - sonatype-nexus-staging - https://oss.sonatype.org/ - true - - - - - - - sonatype-nexus-snapshots - https://oss.sonatype.org/content/repositories/snapshots/ - - - sonatype-nexus-staging - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - - - diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 20a94279..6e1d3c89 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -1,9 +1,14 @@ - + us.codecraft - webmagic-parent - 0.10.3 + webmagic + 1.0.0 4.0.0 @@ -15,11 +20,6 @@ httpclient - - junit - junit - - org.apache.commons commons-lang3 @@ -45,12 +45,6 @@ mockito-all - - org.slf4j - slf4j-log4j12 - true - - org.apache.commons commons-collections4 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 17f8b03d..b4c161a9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -71,6 +71,7 @@ public class Page { * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, * and {@link #request} is specified. * + * @param request the {@link Request}. * @return the page. * @since 0.10.0 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9fc28619..a59b2063 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,13 +1,14 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.downloader.Downloader; -import us.codecraft.webmagic.model.HttpRequestBody; -import us.codecraft.webmagic.utils.Experimental; - import java.io.Serializable; +import java.util.Collections; import java.util.HashMap; import java.util.Map; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.model.HttpRequestBody; +import us.codecraft.webmagic.utils.Experimental; + /** * Object contains url to crawl.
* It contains some additional information.
@@ -35,7 +36,7 @@ public class Request implements Serializable { /** * Store additional information in extras. */ - private Map extras; + private Map extras = new HashMap<>(); /** * cookies for current url, if not set use Site's cookies @@ -93,9 +94,6 @@ public class Request implements Serializable { } public Request putExtra(String key, T value) { - if (extras == null) { - extras = new HashMap(); - } extras.put(key, value); return this; } @@ -105,11 +103,11 @@ public class Request implements Serializable { } public Map getExtras() { - return extras; + return Collections.unmodifiableMap(extras); } public Request setExtras(Map extras) { - this.extras = extras; + this.extras.putAll(extras); return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 9f9201ee..a35af70a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,11 +9,8 @@ import java.util.Date; import java.util.List; import java.util.UUID; import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; @@ -75,9 +72,9 @@ public class Spider implements Runnable, Task { protected Site site; protected String uuid; - - protected Scheduler scheduler = new QueueScheduler(); - + + protected SpiderScheduler scheduler; + protected Logger logger = LoggerFactory.getLogger(getClass()); protected CountableThreadPool threadPool; @@ -88,7 +85,7 @@ public class Spider implements Runnable, Task { protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - protected boolean exitWhenComplete = true; + protected volatile boolean exitWhenComplete = true; protected final static int STAT_INIT = 0; @@ -100,10 +97,6 @@ public class Spider implements Runnable, Task { protected boolean destroyWhenExit = true; - private ReentrantLock newUrlLock = new ReentrantLock(); - - private Condition newUrlCondition = newUrlLock.newCondition(); - private List spiderListeners; private final AtomicLong pageCount = new AtomicLong(0); @@ -131,6 +124,7 @@ public class Spider implements Runnable, Task { public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); + this.scheduler = new SpiderScheduler(new QueueScheduler()); } /** @@ -186,15 +180,15 @@ public class Spider implements Runnable, Task { /** * set scheduler for Spider * - * @param scheduler scheduler + * @param updateScheduler scheduler * @return this * @see Scheduler * @since 0.2.1 */ - public Spider setScheduler(Scheduler scheduler) { + public Spider setScheduler(Scheduler updateScheduler) { checkIfRunning(); - Scheduler oldScheduler = this.scheduler; - this.scheduler = scheduler; + SpiderScheduler oldScheduler = this.scheduler; + scheduler.setScheduler(updateScheduler); if (oldScheduler != null) { Request request; while ((request = oldScheduler.poll(this)) != null) { @@ -213,7 +207,7 @@ public class Spider implements Runnable, Task { * @deprecated */ @Deprecated - public Spider pipeline(Pipeline pipeline) { + public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); } @@ -264,7 +258,7 @@ public class Spider implements Runnable, Task { * @deprecated */ @Deprecated - public Spider downloader(Downloader downloader) { + public Spider downloader(Downloader downloader) { return setDownloader(downloader); } @@ -333,10 +327,10 @@ public class Spider implements Runnable, Task { } } else { // wait until new url added, - if (waitNewUrl()) { - //if interrupted + if (scheduler.waitNewUrl(threadPool, emptySleepTime)) { + // if interrupted break; - } + } continue; } } @@ -353,7 +347,7 @@ public class Spider implements Runnable, Task { logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); - signalNewUrl(); + scheduler.signalNewUrl(); } } }); @@ -536,7 +530,7 @@ public class Spider implements Runnable, Task { for (String url : urls) { addRequest(new Request(url)); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } @@ -588,42 +582,10 @@ public class Spider implements Runnable, Task { for (Request request : requests) { addRequest(request); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } - /** - * - * @return isInterrupted - */ - private boolean waitNewUrl() { - // now there may not be any thread live - newUrlLock.lock(); - try { - //double check,unnecessary, unless very fast concurrent - if (threadPool.getThreadAlive() == 0) { - return false; - } - //wait for amount of time - newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); - return false; - } catch (InterruptedException e) { - // logger.warn("waitNewUrl - interrupted, error {}", e); - return true; - } finally { - newUrlLock.unlock(); - } - } - - private void signalNewUrl() { - try { - newUrlLock.lock(); - newUrlCondition.signalAll(); - } finally { - newUrlLock.unlock(); - } - } - public void start() { runAsync(); } @@ -636,6 +598,13 @@ public class Spider implements Runnable, Task { } } + /** + * Stop when all tasks in the queue are completed and all worker threads are also completed + */ + public void stopWhenComplete(){ + this.exitWhenComplete = true; + } + /** * start with more than one threads * @@ -799,7 +768,7 @@ public class Spider implements Runnable, Task { } public Scheduler getScheduler() { - return scheduler; + return scheduler.getScheduler(); } /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java new file mode 100644 index 00000000..1005bac8 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + +import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.thread.CountableThreadPool; + +public class SpiderScheduler { + private Scheduler scheduler; + private final ReentrantLock newUrlLock = new ReentrantLock(); + private final Condition newUrlCondition = newUrlLock.newCondition(); + + public SpiderScheduler(Scheduler scheduler) { + this.scheduler = scheduler; + } + + public Scheduler getScheduler() { + return scheduler; + } + + public void setScheduler(Scheduler scheduler) { + this.scheduler = scheduler; + } + + public Request poll(Spider spider) { + return scheduler.poll(spider); + } + + public void push(Request request, Spider spider) { + scheduler.push(request, spider); + } + + public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) { + newUrlLock.lock(); + try { + if (threadPool.getThreadAlive() == 0) { + return false; + } + newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; + } catch (InterruptedException e) { + return true; + } finally { + newUrlLock.unlock(); + } + } + + public void signalNewUrl() { + try { + newUrlLock.lock(); + newUrlCondition.signalAll(); + } finally { + newUrlLock.unlock(); + } + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 4baaf4a4..16846786 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -42,7 +42,9 @@ public class HttpUriRequestConverter { HttpClientContext httpContext = new HttpClientContext(); if (proxy != null && proxy.getUsername() != null) { AuthState authState = new AuthState(); - authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); + BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY); + UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()); + authState.update(proxyAuthScheme, proxyCredentials); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); } if (request.getCookies() != null && !request.getCookies().isEmpty()) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index c063b482..85ff5fa6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable { return elements; } - @Override public Selectable smartContent() { SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, getSourceTexts()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index c78f6791..18258e9a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable { throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } - @Override - public Selectable smartContent() { - throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); - } - @Override public Selectable links() { throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 9412cfce..a4d5fdb9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -51,14 +51,6 @@ public interface Selectable { * @return new Selectable after extract */ public Selectable css(String selector, String attrName); - - /** - * select smart content with ReadAbility algorithm - * - * @return content - */ - public Selectable smartContent(); - /** * select all links * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index ccf00a46..63bb4c11 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -21,6 +21,10 @@ public abstract class CharsetUtils { private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); + private CharsetUtils() { + throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!"); + } + public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { String charset; // charset diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index c61483a3..ea317c40 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -116,6 +116,10 @@ public class UrlUtils { private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE); public static String getCharset(String contentType) { + if (contentType == null) { + return null; + } + Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { String charset = matcher.group(1); diff --git a/webmagic-core/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml deleted file mode 100644 index c2b5a2f5..00000000 --- a/webmagic-core/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java index c7e4943d..b8f699a6 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java @@ -1,9 +1,13 @@ package us.codecraft.webmagic; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Map; + import org.junit.Test; -import us.codecraft.webmagic.utils.HttpConstant; -import static org.assertj.core.api.Assertions.assertThat; +import us.codecraft.webmagic.utils.HttpConstant; /** * @author code4crafter@gmail.com @@ -22,4 +26,28 @@ public class RequestTest { assertThat(requestA).isNotEqualTo(requestB); assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode()); } + + @Test + public void testSetExtras() { + Request request = new Request(); + Map extras = Collections.singletonMap("a", "1"); + request.setExtras(extras); + request.putExtra("b", "2"); + assertThat(request.getExtra("a")).isEqualTo("1"); + assertThat(request.getExtra("b")).isEqualTo("2"); + } + + @Test + public void testGetExtras() { + Request request = new Request(); + request.putExtra("a", "1"); + assertThat(request.getExtras()).containsEntry("a", "1"); + } + + @Test(expected = UnsupportedOperationException.class) + public void testGetExtrasShouldBeUnmodifiable() { + Request request = new Request(); + request.getExtras().put("a", "1"); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java index 783b82dd..47c4fcc1 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -1,8 +1,12 @@ package us.codecraft.webmagic; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import org.junit.Test; @@ -14,4 +18,23 @@ public class SiteTest { assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); } + @Test + public void addCookieTest(){ + Site site=Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + site.addCookie("cookieDefault","cookie-webmagicDefault"); + String firstDomain="example.com"; + String secondDomain="exampleCopy.com"; + site.addCookie(firstDomain, "cookie", "cookie-webmagic"); + site.addCookie(firstDomain, "cookieCopy", "cookie-webmagicCopy"); + site.addCookie(secondDomain, "cookie", "cookie-webmagic"); + Map> allCookies = site.getAllCookies(); + List domains=new ArrayList<>(); + for(String key : allCookies.keySet()){ + domains.add(key); + } + assertEquals("cookie-webmagic", allCookies.get(firstDomain).get("cookie")); + assertEquals("cookie-webmagicCopy", allCookies.get(firstDomain).get("cookieCopy")); + assertEquals("cookie-webmagic", allCookies.get(secondDomain).get("cookie")); + assertEquals(2, domains.size()); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 780ca752..1ff7b4dd 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -40,6 +40,7 @@ import static com.github.dreamhead.moco.Moco.uri; import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; /** @@ -333,5 +334,13 @@ public class HttpClientDownloaderTest { }); } + @Test + public void test_no_task_download(){ + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423/"); + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + assertThrows(NullPointerException.class, () -> httpClientDownloader.download(request,null)); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 8e4c8202..61fc6ab8 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -8,19 +8,19 @@ import java.util.ArrayList; import java.util.List; import org.apache.http.HttpHost; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; /** * @author yxssfxwzy@sina.com May 30, 2014 - * + * */ -public class ProxyTest { +class ProxyTest { private static List httpProxyList = new ArrayList(); - @BeforeClass - public static void before() { + @BeforeAll + static void before() { // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", // "0.0.0.4:0" }; String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; @@ -48,7 +48,7 @@ public class ProxyTest { } @Test - public void testCreate() { + void testCreate() { Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080")); assertNull(proxy.getScheme()); assertNull(proxy.getUsername()); @@ -86,7 +86,15 @@ public class ProxyTest { } @Test - public void testToString() { + void testEqualsHashCode() { + var proxy0 = new Proxy("::1", 1080); + var proxy1 = new Proxy("::1", 1080); + assertEquals(proxy0, proxy1); + assertEquals(proxy0.hashCode(), proxy1.hashCode()); + } + + @Test + void testToString() { assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java new file mode 100644 index 00000000..59885ebd --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic.selector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +public class AndSelectorTest { + + @Test + public void testSelectList() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + List selectors = new ArrayList(); + selectors.add(new CssSelector("div")); + selectors.add(new XpathSelector("//div[@class='item1']")); + AndSelector andSelector = new AndSelector(selectors); + List result = andSelector.selectList(htmlContent); + assertEquals("
\n Item 1\n
", result.get(0)); + } + + @Test + public void testSelectList_NoResults() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + List selectors = new ArrayList(); + selectors.add(new CssSelector("div")); + selectors.add(new XpathSelector("//div[@class='item']")); + AndSelector andSelector = new AndSelector(selectors); + List result = andSelector.selectList(htmlContent); + assertEquals(0, result.size()); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java new file mode 100644 index 00000000..8b1ace90 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.runners.MockitoJUnitRunner; + +import java.util.List; +import static org.junit.Assert.*; + +public class CssSelectorTest { + + @Test + public void testSelectElement() { + CssSelector cssSelector = new CssSelector("div"); + String htmlContent = "Dummy Page
Hello World!
"; + Document doc = Jsoup.parse(htmlContent); + Element dummyElement = doc.getElementById("dummyDiv"); + Element resultElement = cssSelector.selectElement(dummyElement); + assertNotNull(resultElement); + } + + @Test + public void testSelectList() { + CssSelector cssSelector = new CssSelector("div"); + String htmlContent = "Dummy Page
Hello World!
"; + Document doc = Jsoup.parse(htmlContent); + Element dummyElement = doc.getElementById("dummyDiv"); + List result = cssSelector.selectList(dummyElement); + assertEquals(1, result.size()); + assertEquals("[
\n Hello World!\n
]", result.toString()); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java new file mode 100644 index 00000000..24d87647 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.selector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +public class OrSelectorTest { + @Test + public void testSelectList() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + String expectedResult = "[\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + ",
\n" + + " Item 1\n" + + "
,
\n" + + " Item 2\n" + + "
]"; + List selectors = new ArrayList(); + selectors.add(new CssSelector("head")); + selectors.add(new XpathSelector("//div[@class='item1']")); + selectors.add(new XpathSelector("//div[@class='item2']")); + OrSelector orSelector = new OrSelector(selectors); + List result = orSelector.selectList(htmlContent); + assertEquals(expectedResult, result.toString()); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java new file mode 100644 index 00000000..987a6f77 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.utils; + +import static org.junit.jupiter.api.Assertions.assertNull; + +import java.io.IOException; + +import org.junit.jupiter.api.Test; + +class CharsetUtilsTest { + + @Test + void testDetectCharset() throws IOException { + assertNull(CharsetUtils.detectCharset(null, new byte[0])); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 6afdeefe..38c8295b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.utils; +import static org.junit.Assert.assertNull; + import org.junit.Assert; import org.junit.Test; @@ -43,5 +45,9 @@ public class UrlUtilsTest { Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url)); } + @Test + public void testGetCharset() { + assertNull(UrlUtils.getCharset(null)); + } } diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml deleted file mode 100644 index c2b5a2f5..00000000 --- a/webmagic-core/src/test/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-core/src/test/resources/log4j2-test.xml b/webmagic-core/src/test/resources/log4j2-test.xml new file mode 100644 index 00000000..86aee5f5 --- /dev/null +++ b/webmagic-core/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 46b66f32..19cdc33d 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -1,14 +1,16 @@ - 4.0.0 us.codecraft - webmagic-parent - 0.10.3 + webmagic + 1.0.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index dd72ccbd..15f94cf5 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -1,15 +1,26 @@ - + us.codecraft - webmagic-parent - 0.10.3 + webmagic + 1.0.0 4.0.0 webmagic-extension + + org.projectlombok + lombok + 1.18.32 + provided + redis.clients jedis @@ -29,10 +40,6 @@ webmagic-core ${project.version} - - junit - junit - diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index f1d2f84d..67344758 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -1,5 +1,9 @@ package us.codecraft.webmagic.model; +import lombok.Getter; +import lombok.Setter; + +import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.selector.Selector; /** @@ -7,18 +11,18 @@ import us.codecraft.webmagic.selector.Selector; * @author code4crafter@gmail.com
* @since 0.2.0 */ -class Extractor { +public class Extractor { + @Getter @Setter protected Selector selector; + @Getter protected final Source source; protected final boolean notNull; protected final boolean multi; - - static enum Source {Html, Url, RawHtml, RawText} - + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; this.source = source; @@ -26,23 +30,11 @@ class Extractor { this.multi = multi; } - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - boolean isNotNull() { + public boolean isNotNull() { return notNull; } - boolean isMulti() { + public boolean isMulti() { return multi; } - - void setSelector(Selector selector) { - this.selector = selector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a2cba133..d4cb5937 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -1,58 +1,33 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; import java.lang.reflect.Method; +import lombok.Getter; +import lombok.Setter; + /** * Wrapper of field and extractor. * @author code4crafter@gmail.com
* @since 0.2.0 */ -class FieldExtractor extends Extractor { +public class FieldExtractor extends Extractor { + @Getter private final Field field; + @Getter @Setter private Method setterMethod; + @Getter @Setter private ObjectFormatter objectFormatter; public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { super(selector, source, notNull, multi); this.field = field; } - - Field getField() { - return field; - } - - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - void setSetterMethod(Method setterMethod) { - this.setterMethod = setterMethod; - } - - Method getSetterMethod() { - return setterMethod; - } - - boolean isNotNull() { - return notNull; - } - - ObjectFormatter getObjectFormatter() { - return objectFormatter; - } - - void setObjectFormatter(ObjectFormatter objectFormatter) { - this.objectFormatter = objectFormatter; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 1e25a46c..751aafe7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -3,17 +3,21 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import lombok.Getter; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; -import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; +import us.codecraft.webmagic.model.sources.Source; +import us.codecraft.webmagic.model.sources.SourceTextExtractor; +import us.codecraft.webmagic.model.sources.Source.*; import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; import java.lang.annotation.Annotation; import java.lang.reflect.Field; -import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; @@ -29,14 +33,19 @@ import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText; */ class PageModelExtractor { + @Getter private List targetUrlPatterns = new ArrayList(); + @Getter private Selector targetUrlRegionSelector; + @Getter private List helpUrlPatterns = new ArrayList(); + @Getter private Selector helpUrlRegionSelector; + @Getter private Class clazz; private List fieldExtractors; @@ -86,7 +95,7 @@ class PageModelExtractor { regexPattern = ".*"; } fieldExtractor = new FieldExtractor(field, - new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), + new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(), extractByUrl.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -112,7 +121,7 @@ class PageModelExtractor { default: selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } - fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(), comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -127,26 +136,23 @@ class PageModelExtractor { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - ExtractBy.Source source0 = extractBy.source(); - if (extractBy.type()== ExtractBy.Type.JsonPath){ - source0 = RawText; - } - FieldExtractor.Source source = null; - switch (source0){ + ExtractBy.Source extractSource = extractBy.source(); + if (extractBy.type()== ExtractBy.Type.JsonPath) + extractSource = RawText; + Source source = null; + switch (extractSource) { case RawText: - source = FieldExtractor.Source.RawText; + source = new RawText(); break; case RawHtml: - source = FieldExtractor.Source.RawHtml; + source = new RawHtml(); break; case SelectedHtml: - source =FieldExtractor.Source.Html; + source = new SelectedHtml(); break; default: - source =FieldExtractor.Source.Html; - + source = new SelectedHtml(); } - fieldExtractor = new FieldExtractor(field, selector, source, extractBy.notNull(), List.class.isAssignableFrom(field.getType())); fieldExtractor.setSetterMethod(getSetterMethod(clazz, field)); @@ -193,7 +199,7 @@ class PageModelExtractor { annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi()); } } @@ -233,135 +239,15 @@ class PageModelExtractor { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - if (fieldExtractor.isMulti()) { - List value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().selectList(html); - } - break; - case Url: - value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().selectList(html); - } - if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - List converted = convert(value, fieldExtractor.getObjectFormatter()); - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } else { - String value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().select(html); - } - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().select(html); - } - if (value == null && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - Object converted = convert(value, fieldExtractor.getObjectFormatter()); - if (converted == null && fieldExtractor.isNotNull()) { - return null; - } - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } + PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor); + if (!field.operation(o, fieldExtractor, logger)) + return null; } - if (AfterExtractor.class.isAssignableFrom(clazz)) { + if (AfterExtractor.class.isAssignableFrom(clazz)) ((AfterExtractor) o).afterProcess(page); - } - } catch (InstantiationException e) { - logger.error("extract fail", e); - } catch (IllegalAccessException e) { - logger.error("extract fail", e); - } catch (InvocationTargetException e) { + } catch (Exception e) { logger.error("extract fail", e); } return o; } - - private Object convert(String value, ObjectFormatter objectFormatter) { - try { - Object format = objectFormatter.format(value); - logger.debug("String {} is converted to {}", value, format); - return format; - } catch (Exception e) { - logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); - } - return null; - } - - private List convert(List values, ObjectFormatter objectFormatter) { - List objects = new ArrayList(); - for (String value : values) { - Object converted = convert(value, objectFormatter); - if (converted != null) { - objects.add(converted); - } - } - return objects; - } - - private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { - if (value == null) { - return; - } - if (fieldExtractor.getSetterMethod() != null) { - fieldExtractor.getSetterMethod().invoke(o, value); - } - fieldExtractor.getField().set(o, value); - } - - Class getClazz() { - return clazz; - } - - List getTargetUrlPatterns() { - return targetUrlPatterns; - } - - List getHelpUrlPatterns() { - return helpUrlPatterns; - } - - Selector getTargetUrlRegionSelector() { - return targetUrlRegionSelector; - } - - Selector getHelpUrlRegionSelector() { - return helpUrlRegionSelector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java new file mode 100644 index 00000000..4a4bf38a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; +import java.util.List; + +import org.slf4j.Logger; + +import lombok.Getter; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public class MultipleField extends PageField { + @Getter + private List fieldNames; + + public MultipleField(List fieldNames) { + this.fieldNames = fieldNames; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull()) + return false; + if (fieldExtractor.getObjectFormatter() != null) { + List converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger); + setField(o, fieldExtractor, converted); + } + else + setField(o, fieldExtractor, this.fieldNames); + return true; + } + + private List convert(List values, ObjectFormatter objectFormatter, Logger logger) { + List objects = new ArrayList<>(); + for (String value : values) { + Object converted = this.convert(value, objectFormatter, logger); + if (converted != null) + objects.add(converted); + } + return objects; + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java new file mode 100644 index 00000000..ad442833 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public abstract class PageField { + public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException; + + protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) { + try { + Object format = objectFormatter.format(value); + logger.debug("String {} is converted to {}", value, format); + return format; + } catch (Exception e) { + logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); + } + return null; + } + + protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { + if (value != null) { + if (fieldExtractor.getSetterMethod() != null) + fieldExtractor.getSetterMethod().invoke(o, value); + fieldExtractor.getField().set(o, value); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java new file mode 100644 index 00000000..136a1c56 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import lombok.Getter; +import us.codecraft.webmagic.model.FieldExtractor; + +public class SingleField extends PageField { + @Getter + private String fieldName; + + public SingleField(String fieldName) { + this.fieldName = fieldName; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if (fieldExtractor.getObjectFormatter() != null) { + Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger); + if (converted == null && fieldExtractor.isNotNull()) + return false; + setField(o, fieldExtractor, converted); + } else + setField(o, fieldExtractor, this.fieldName); + return true; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java new file mode 100644 index 00000000..f03b8864 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java @@ -0,0 +1,85 @@ +package us.codecraft.webmagic.model.formatter; + +public interface BasicClassDetector { + Class detectBasicClass(Class type); +} + +class IntegerClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { + return Integer.class; + } + return null; + } +} + +class LongClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Long.TYPE) || type.equals(Long.class)) { + return Long.class; + } + return null; + } +} + +class DoubleClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Double.TYPE) || type.equals(Double.class)) { + return Double.class; + } + return null; + } +} + +class FloatClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Float.TYPE) || type.equals(Float.class)) { + return Float.class; + } + return null; + } +} + +class ShortClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Short.TYPE) || type.equals(Short.class)) { + return Short.class; + } + return null; + } +} + +class CharacterClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Character.TYPE) || type.equals(Character.class)) { + return Character.class; + } + return null; + } +} + +class ByteClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { + return Byte.class; + } + return null; + } +} + +class BooleanClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { + return Boolean.class; + } + return null; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java index f9d76a84..2d4d85b0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter implements ObjectFormatter { } protected abstract T formatTrimmed(String raw) throws Exception; - public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class); + public static final List basicClassDetector= Arrays.asList(new IntegerClassDetector(), + new LongClassDetector(), + new FloatClassDetector(), + new DoubleClassDetector(), + new ShortClassDetector(), + new ByteClassDetector(), + new BooleanClassDetector(), + new CharacterClassDetector()); public static Class detectBasicClass(Class type) { - if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { - return Integer.class; - } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { - return Long.class; - } else if (type.equals(Double.TYPE) || type.equals(Double.class)) { - return Double.class; - } else if (type.equals(Float.TYPE) || type.equals(Float.class)) { - return Float.class; - } else if (type.equals(Short.TYPE) || type.equals(Short.class)) { - return Short.class; - } else if (type.equals(Character.TYPE) || type.equals(Character.class)) { - return Character.class; - } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { - return Byte.class; - } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { - return Boolean.class; + for (BasicClassDetector detector : basicClassDetector) { + Class detectedClass = detector.detectBasicClass(type); + if (detectedClass != null) { + return detectedClass; + } } return type; } @@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter implements ObjectFormatter { } } - } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java new file mode 100644 index 00000000..14682722 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java @@ -0,0 +1,68 @@ +package us.codecraft.webmagic.model.sources; + +import java.util.List; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; + +public interface Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); + + public class RawHtml implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return page.getHtml().selectDocument(fieldExtractor.getSelector()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } + } + + public class SelectedHtml implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + if (isRaw) + return page.getHtml().selectDocument(fieldExtractor.getSelector()); + else + return fieldExtractor.getSelector().select(html); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + if (isRaw) + return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + else + return fieldExtractor.getSelector().selectList(html); + } + } + + public class Url implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(page.getUrl().toString()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(page.getUrl().toString()); + } + } + + public class RawText implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(page.getRawText()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(page.getRawText()); + } + } + + public class DefaultSource implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(html); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(html); + } + } +} + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java new file mode 100644 index 00000000..1e572695 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.model.sources; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.MultipleField; +import us.codecraft.webmagic.model.fields.PageField; +import us.codecraft.webmagic.model.fields.SingleField; + +public class SourceTextExtractor { + public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + Source source = fieldExtractor.getSource(); + if (fieldExtractor.isMulti()) + return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor)); + else + return new SingleField(source.getText(page, html, isRaw, fieldExtractor)); + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java index 46d47e5a..7abe5bfa 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java @@ -102,7 +102,7 @@ public class RedisPriorityScheduler extends RedisScheduler { } private void setExtrasInItem(Jedis jedis,Request request, Task task) { - if (request.getExtras() != null) { + if (!request.getExtras().isEmpty()) { String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset(getItemKey(task), field, value); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 19e83132..8d61bea3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -84,7 +84,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor return true; } - if (request.getExtras() != null && !request.getExtras().isEmpty()) { + if (!request.getExtras().isEmpty()) { return true; } if (request.getPriority() != 0L) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java old mode 100755 new mode 100644 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java old mode 100755 new mode 100644 diff --git a/webmagic-extension/src/main/resources/log4j.xml b/webmagic-extension/src/main/resources/log4j.xml deleted file mode 100644 index c2b5a2f5..00000000 --- a/webmagic-extension/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java index 63c40d29..c2081dbf 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java @@ -13,7 +13,6 @@ import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com - * @date 14-4-5 */ public class ConfigurablePageProcessorTest { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java index 627fa6e8..1014a45f 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -12,7 +12,6 @@ import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com - * @date 14-4-4 */ public class ModelPageProcessorTest { diff --git a/webmagic-extension/src/test/resources/log4j.xml b/webmagic-extension/src/test/resources/log4j.xml deleted file mode 100644 index c2b5a2f5..00000000 --- a/webmagic-extension/src/test/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-extension/src/test/resources/log4j2-test.xml b/webmagic-extension/src/test/resources/log4j2-test.xml new file mode 100644 index 00000000..86aee5f5 --- /dev/null +++ b/webmagic-extension/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 3f191c7b..92116136 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -1,9 +1,14 @@ - + - webmagic-parent us.codecraft - 0.10.3 + webmagic + 1.0.0 4.0.0 @@ -20,10 +25,6 @@ webmagic-extension ${project.version} - - junit - junit - org.mapdb mapdb @@ -42,7 +43,7 @@ com.fasterxml.jackson.core jackson-databind - 2.15.2 + 2.16.0 diff --git a/webmagic-samples/src/main/resources/log4j.xml b/webmagic-samples/src/main/resources/log4j.xml deleted file mode 100644 index a6630f81..00000000 --- a/webmagic-samples/src/main/resources/log4j.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-samples/src/main/resources/log4j2.xml b/webmagic-samples/src/main/resources/log4j2.xml new file mode 100644 index 00000000..f3bad53d --- /dev/null +++ b/webmagic-samples/src/main/resources/log4j2.xml @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index cbd1621c..2530bd81 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -1,14 +1,23 @@ - + - webmagic-parent us.codecraft - 0.10.3 + webmagic + 1.0.0 4.0.0 webmagic-saxon + + true + + ${project.groupId} @@ -23,23 +32,6 @@ net.sf.saxon Saxon-HE - - junit - junit - - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - true - - - - - diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 568c8d0b..3c03aaf8 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -1,9 +1,14 @@ - + - webmagic-parent us.codecraft - 0.10.3 + webmagic + 1.0.0 4.0.0 @@ -13,6 +18,14 @@ + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-slf4j2-impl + org.jruby jruby @@ -30,25 +43,22 @@ commons-cli commons-cli - - junit - junit - test - ${project.groupId} webmagic-core ${project.version} - - org.slf4j - slf4j-log4j12 - ${project.groupId} webmagic-extension ${project.version} + + org.projectlombok + lombok + 1.18.32 + provided + @@ -90,4 +100,4 @@ - \ No newline at end of file + diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java new file mode 100644 index 00000000..873176e6 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.scripts; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import lombok.Getter; +import lombok.Setter; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Language; +import us.codecraft.webmagic.utils.WMCollections; + +public class Params { + @Getter + Language language = new Javascript(); + + @Getter @Setter + String scriptFileName; + + @Getter @Setter + List urls; + + @Getter @Setter + int thread = 1; + + @Getter @Setter + int sleepTime = 1000; + + private static Map> alias; + + public Params() { + alias = new HashMap>(); + alias.put(new Javascript(), WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); + alias.put(new JRuby(), WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); + } + + public void setLanguagefromArg(String arg) { + for (Map.Entry> languageSetEntry : alias.entrySet()) { + if (languageSetEntry.getValue().contains(arg)) { + this.language = languageSetEntry.getKey(); + return; + } + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 0423e58e..c60b3ec3 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -1,88 +1,21 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.cli.*; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; + import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.scripts.config.CommandLineOption; import us.codecraft.webmagic.utils.WMCollections; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Set; /** - * @author code4crafter@gmail.com + * @author code4crafter@gmail.com / FrancoisGib * @since 0.4.1 */ public class ScriptConsole { - - private static class Params { - Language language = Language.JavaScript; - String scriptFileName; - List urls; - int thread = 1; - int sleepTime = 1000; - private static Map> alias = new HashMap>(); - - static { - alias.put(Language.JavaScript, WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); - alias.put(Language.JRuby, WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); - } - - public void setLanguagefromArg(String arg) { - for (Map.Entry> languageSetEntry : alias.entrySet()) { - if (languageSetEntry.getValue().contains(arg)) { - this.language = languageSetEntry.getKey(); - return; - } - } - } - - private Language getLanguage() { - return language; - } - - private void setLanguage(Language language) { - this.language = language; - } - - private String getScriptFileName() { - return scriptFileName; - } - - private void setScriptFileName(String scriptFileName) { - this.scriptFileName = scriptFileName; - } - - private List getUrls() { - return urls; - } - - private void setUrls(List urls) { - this.urls = urls; - } - - private int getThread() { - return thread; - } - - private void setThread(int thread) { - this.thread = thread; - } - - private int getSleepTime() { - return sleepTime; - } - - private void setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - } - } - public static void main(String[] args) { Params params = parseCommand(args); startSpider(params); @@ -140,45 +73,9 @@ public class ScriptConsole { private static Params readOptions(CommandLine commandLine) { Params params = new Params(); - if (commandLine.hasOption("l")) { - String language = commandLine.getOptionValue("l"); - params.setLanguagefromArg(language); - } - if (commandLine.hasOption("f")) { - String scriptFilename = commandLine.getOptionValue("f"); - params.setScriptFileName(scriptFilename); - } else { - exit(); - } - if (commandLine.hasOption("s")) { - Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); - params.setSleepTime(sleepTime); - } - if (commandLine.hasOption("t")) { - Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); - params.setThread(thread); - } - if (commandLine.hasOption("g")) { - configLogger(commandLine.getOptionValue("g")); - } - params.setUrls(commandLine.getArgList()); + List options = CommandLineOption.getAllOptions(); + for (CommandLineOption option : options) + option.addParamOptionIfInCommandLine(params, commandLine); return params; } - - private static void configLogger(String value) { - Logger rootLogger = Logger.getRootLogger(); - if ("debug".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.DEBUG); - } else if ("info".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.INFO); - } else if ("warn".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.WARN); - } else if ("trace".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.TRACE); - } else if ("off".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.OFF); - } else if ("error".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.ERROR); - } - } -} +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java index d1e5d7fe..bdfbbaed 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java @@ -2,6 +2,9 @@ package us.codecraft.webmagic.scripts; import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; + +import us.codecraft.webmagic.scripts.languages.Language; + import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -11,14 +14,11 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class ScriptEnginePool { - private final int size; - private final AtomicInteger availableCount; private final LinkedBlockingQueue scriptEngines = new LinkedBlockingQueue(); public ScriptEnginePool(Language language,int size) { - this.size = size; this.availableCount = new AtomicInteger(size); for (int i=0;i getAllOptions() { + return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG()); + } +} + +class OptionL extends CommandLineOption { + public OptionL() { + super('l'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String language = commandLine.getOptionValue("l"); + params.setLanguagefromArg(language); + } +} + +class OptionF extends CommandLineOption { + public OptionF() { + super('f'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String scriptFilename = commandLine.getOptionValue("f"); + params.setScriptFileName(scriptFilename); + } +} + +class OptionS extends CommandLineOption { + public OptionS() { + super('s'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); + params.setSleepTime(sleepTime); + } +} + +class OptionT extends CommandLineOption { + public OptionT() { + super('t'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); + params.setThread(thread); + } +} + +class OptionG extends CommandLineOption { + public OptionG() { + super('g'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + ConfigLogger.configLogger(commandLine.getOptionValue("g")); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java new file mode 100644 index 00000000..9e81ea6c --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.scripts.config; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.core.Logger; +import org.slf4j.LoggerFactory; + +public class ConfigLogger { + /** + * Log the config parameter. If the counter is less than the number of available + * options then it means that the user entered an option + * + * @param value The config string + */ + public static void configLogger(String value) { + List> options = List.of( + Pair.of("debug", Level.DEBUG), + Pair.of("info", Level.INFO), + Pair.of("warn", Level.WARN), + Pair.of("trace", Level.TRACE), + Pair.of("off", Level.OFF), + Pair.of("error", Level.ERROR)); + Pair option = options.get(0); + int i = 1; + while (i < options.size() && !option.getLeft().equalsIgnoreCase(value)) + option = options.get(i++); + if (i < options.size()) { + Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); + rootLogger.setLevel(option.getRight()); + } + } +} diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java new file mode 100644 index 00000000..b3a3209a --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.jruby.RubyHash; + +import us.codecraft.webmagic.Page; + +public class JRuby extends Language { + public JRuby() { + super("jruby","ruby/defines.rb",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext()); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry) itruby.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java new file mode 100644 index 00000000..b0f7b647 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import us.codecraft.webmagic.Page; + +public class Javascript extends Language { + public Javascript() { + super("javascript","js/defines.js",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java new file mode 100644 index 00000000..9124d2db --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.python.core.PyDictionary; + +import us.codecraft.webmagic.Page; + +public class Jython extends Language { + public Jython() { + super("jython","python/defines.py",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + PyDictionary oJython = (PyDictionary) engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java old mode 100755 new mode 100644 similarity index 51% rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java index 2f9d22d5..44e6ba0a --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java @@ -1,15 +1,18 @@ -package us.codecraft.webmagic.scripts; +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; +import us.codecraft.webmagic.Page; /** - * @author code4crafter@gmail.com + * @author FrancoisGib */ -public enum Language { - - JavaScript("javascript","js/defines.js",""), - - JRuby("jruby","ruby/defines.rb",""), - - Jython("jython","python/defines.py",""); +public abstract class Language { + public Language(String engineName, String defineFile, String gatherFile) { + this.engineName = engineName; + this.defineFile = defineFile; + this.gatherFile = gatherFile; + } private String engineName; @@ -17,12 +20,6 @@ public enum Language { private String gatherFile; - Language(String engineName, String defineFile, String gatherFile) { - this.engineName = engineName; - this.defineFile = defineFile; - this.gatherFile = gatherFile; - } - public String getEngineName() { return engineName; } @@ -34,4 +31,6 @@ public enum Language { public String getGatherFile() { return gatherFile; } + + public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException; } diff --git a/webmagic-scripts/src/main/resources/log4j.xml b/webmagic-scripts/src/main/resources/log4j.xml deleted file mode 100755 index 474269cb..00000000 --- a/webmagic-scripts/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java index ffeb9c99..b4c28521 100755 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -2,7 +2,11 @@ package us.codecraft.webmagic.scripts; import org.junit.Ignore; import org.junit.Test; + import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Jython; /** * @author code4crafter@gmail.com @@ -13,14 +17,14 @@ public class ScriptProcessorTest { @Test public void testJavaScriptProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @Test public void testRubyProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @@ -28,7 +32,7 @@ public class ScriptProcessorTest { @Test public void testPythonProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml deleted file mode 100755 index 1f64d8da..00000000 --- a/webmagic-scripts/src/test/resouces/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-scripts/src/test/resources/log4j2-test.xml b/webmagic-scripts/src/test/resources/log4j2-test.xml new file mode 100644 index 00000000..e2fab660 --- /dev/null +++ b/webmagic-scripts/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index a7a9179b..a0dc1386 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -1,45 +1,46 @@ - - - webmagic-parent - us.codecraft - 0.10.3 - - 4.0.0 + + + us.codecraft + webmagic + 1.0.0 + + 4.0.0 - webmagic-selenium + webmagic-selenium - - - org.seleniumhq.selenium - selenium-java - - - ${project.groupId} - webmagic-core - ${project.version} - - - com.github.detro - phantomjsdriver - - - junit - junit - - + + + org.seleniumhq.selenium + selenium-java + + + ${project.groupId} + webmagic-core + ${project.version} + + + com.github.detro + phantomjsdriver + + - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - true - - - - + + + + org.apache.maven.plugins + maven-deploy-plugin + 3.0.0-M1 + + true + + + + diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index e1d9dd03..b96d2894 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,15 +1,5 @@ package us.codecraft.webmagic.downloader.selenium; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.firefox.FirefoxDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriverService; -import org.openqa.selenium.remote.DesiredCapabilities; -import org.openqa.selenium.remote.RemoteWebDriver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.FileReader; import java.io.IOException; import java.net.MalformedURLException; @@ -22,6 +12,18 @@ import java.util.concurrent.BlockingDeque; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.firefox.FirefoxOptions; +import org.openqa.selenium.phantomjs.PhantomJSDriver; +import org.openqa.selenium.phantomjs.PhantomJSDriverService; +import org.openqa.selenium.remote.DesiredCapabilities; +import org.openqa.selenium.remote.RemoteWebDriver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * @author code4crafter@gmail.com
* Date: 13-7-26
@@ -58,7 +60,7 @@ class WebDriverPool { * Configure the GhostDriver, and initialize a WebDriver instance. This part * of code comes from GhostDriver. * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver - * + * * @author bob.li.0718@gmail.com * @throws IOException */ @@ -73,7 +75,6 @@ class WebDriverPool { // Prepare capabilities sCaps = new DesiredCapabilities(); - sCaps.setJavascriptEnabled(true); sCaps.setCapability("takesScreenshot", false); String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); @@ -134,9 +135,9 @@ class WebDriverPool { sCaps.setBrowserName("phantomjs"); mDriver = new RemoteWebDriver(new URL(driver), sCaps); } else if (driver.equals(DRIVER_FIREFOX)) { - mDriver = new FirefoxDriver(sCaps); + mDriver = new FirefoxDriver(new FirefoxOptions(sCaps)); } else if (driver.equals(DRIVER_CHROME)) { - mDriver = new ChromeDriver(sCaps); + mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps)); } else if (driver.equals(DRIVER_PHANTOMJS)) { mDriver = new PhantomJSDriver(sCaps); } @@ -144,7 +145,7 @@ class WebDriverPool { /** * check whether input is a valid URL - * + * * @author bob.li.0718@gmail.com * @param urlString urlString * @return true means yes, otherwise no. @@ -178,7 +179,7 @@ class WebDriverPool { } /** - * + * * @return * @throws InterruptedException */ diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java index b7bcd80b..43ac84b5 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java @@ -1,17 +1,18 @@ package us.codecraft.webmagic.downloader; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.remote.DesiredCapabilities; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - /** * @author code4crafter@gmail.com
* Date: 13-7-26
@@ -29,10 +30,10 @@ public class SeleniumTest { Map preferences = new HashMap(); preferences.put("profile.default_content_settings", contentSettings); - DesiredCapabilities caps = DesiredCapabilities.chrome(); + DesiredCapabilities caps = new DesiredCapabilities(); caps.setCapability("chrome.prefs", preferences); caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); - WebDriver webDriver = new ChromeDriver(caps); + WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps)); webDriver.get("http://huaban.com/"); WebElement webElement = webDriver.findElement(By.xpath("/html")); System.out.println(webElement.getAttribute("outerHTML"));