commit
9d75cce16d
@ -1,9 +1,77 @@
|
||||
target
|
||||
*.iml
|
||||
out/
|
||||
.idea
|
||||
.classpath
|
||||
target/
|
||||
pom.xml.tag
|
||||
pom.xml.releaseBackup
|
||||
pom.xml.versionsBackup
|
||||
pom.xml.next
|
||||
release.properties
|
||||
dependency-reduced-pom.xml
|
||||
buildNumber.properties
|
||||
.mvn/timing.properties
|
||||
# https://github.com/takari/maven-wrapper#usage-without-binary-jar
|
||||
.mvn/wrapper/maven-wrapper.jar
|
||||
|
||||
# Eclipse m2e generated files
|
||||
# Eclipse Core
|
||||
.project
|
||||
.settings/
|
||||
# JDT-specific (Eclipse Java Development Tools)
|
||||
.classpath
|
||||
.metadata
|
||||
bin/
|
||||
.myeclipse
|
||||
tmp/
|
||||
*.tmp
|
||||
*.bak
|
||||
*.swp
|
||||
*~.nib
|
||||
local.properties
|
||||
.settings/
|
||||
.loadpath
|
||||
.recommenders
|
||||
|
||||
# External tool builders
|
||||
.externalToolBuilders/
|
||||
|
||||
# Locally stored "Eclipse launch configurations"
|
||||
*.launch
|
||||
|
||||
# PyDev specific (Python IDE for Eclipse)
|
||||
*.pydevproject
|
||||
|
||||
# CDT-specific (C/C++ Development Tooling)
|
||||
.cproject
|
||||
|
||||
# CDT- autotools
|
||||
.autotools
|
||||
|
||||
# Java annotation processor (APT)
|
||||
.factorypath
|
||||
|
||||
# PDT-specific (PHP Development Tools)
|
||||
.buildpath
|
||||
|
||||
# sbteclipse plugin
|
||||
.target
|
||||
|
||||
# Tern plugin
|
||||
.tern-project
|
||||
|
||||
# TeXlipse plugin
|
||||
.texlipse
|
||||
|
||||
# STS (Spring Tool Suite)
|
||||
.springBeans
|
||||
|
||||
# Code Recommenders
|
||||
.recommenders/
|
||||
|
||||
# Annotation Processing
|
||||
.apt_generated/
|
||||
.apt_generated_test/
|
||||
|
||||
# Scala IDE specific (Scala & Java development for Eclipse)
|
||||
.cache-main
|
||||
.scala_dependencies
|
||||
.worksheet
|
||||
|
||||
# Uncomment this line if you wish to ignore the project description file.
|
||||
# Typically, this file would be tracked if it contains build/dependency configurations:
|
||||
#.project
|
||||
|
@ -0,0 +1,59 @@
|
||||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||
import us.codecraft.webmagic.thread.CountableThreadPool;
|
||||
|
||||
public class SpiderScheduler {
|
||||
private Scheduler scheduler;
|
||||
private final ReentrantLock newUrlLock = new ReentrantLock();
|
||||
private final Condition newUrlCondition = newUrlLock.newCondition();
|
||||
|
||||
public SpiderScheduler(Scheduler scheduler) {
|
||||
this.scheduler = scheduler;
|
||||
}
|
||||
|
||||
public Scheduler getScheduler() {
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
public void setScheduler(Scheduler scheduler) {
|
||||
this.scheduler = scheduler;
|
||||
}
|
||||
|
||||
public Request poll(Spider spider) {
|
||||
return scheduler.poll(spider);
|
||||
}
|
||||
|
||||
public void push(Request request, Spider spider) {
|
||||
scheduler.push(request, spider);
|
||||
}
|
||||
|
||||
public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
|
||||
newUrlLock.lock();
|
||||
try {
|
||||
if (threadPool.getThreadAlive() == 0) {
|
||||
return false;
|
||||
}
|
||||
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
||||
return false;
|
||||
} catch (InterruptedException e) {
|
||||
return true;
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
public void signalNewUrl() {
|
||||
try {
|
||||
newUrlLock.lock();
|
||||
newUrlCondition.signalAll();
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -1,21 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
@ -0,0 +1,59 @@
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class AndSelectorTest {
|
||||
|
||||
@Test
|
||||
public void testSelectList() {
|
||||
String htmlContent = "<!DOCTYPE html>\n" +
|
||||
"<html lang=\"en\">\n" +
|
||||
"<head>\n" +
|
||||
" <meta charset=\"UTF-8\">\n" +
|
||||
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||
" <title>HTML with XPath</title>\n" +
|
||||
"</head>\n" +
|
||||
"<body>\n" +
|
||||
" <div class=\"container\">\n" +
|
||||
" <div class=\"item1\">Item 1</div>\n" +
|
||||
" <div class=\"item2\">Item 2</div>\n" +
|
||||
" </div>\n" +
|
||||
"</body>\n" +
|
||||
"</html>";
|
||||
List<Selector> selectors = new ArrayList<Selector>();
|
||||
selectors.add(new CssSelector("div"));
|
||||
selectors.add(new XpathSelector("//div[@class='item1']"));
|
||||
AndSelector andSelector = new AndSelector(selectors);
|
||||
List<String> result = andSelector.selectList(htmlContent);
|
||||
assertEquals("<div class=\"item1\">\n Item 1\n</div>", result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSelectList_NoResults() {
|
||||
String htmlContent = "<!DOCTYPE html>\n" +
|
||||
"<html lang=\"en\">\n" +
|
||||
"<head>\n" +
|
||||
" <meta charset=\"UTF-8\">\n" +
|
||||
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||
" <title>HTML with XPath</title>\n" +
|
||||
"</head>\n" +
|
||||
"<body>\n" +
|
||||
" <div class=\"container\">\n" +
|
||||
" <div class=\"item1\">Item 1</div>\n" +
|
||||
" <div class=\"item2\">Item 2</div>\n" +
|
||||
" </div>\n" +
|
||||
"</body>\n" +
|
||||
"</html>";
|
||||
List<Selector> selectors = new ArrayList<Selector>();
|
||||
selectors.add(new CssSelector("div"));
|
||||
selectors.add(new XpathSelector("//div[@class='item']"));
|
||||
AndSelector andSelector = new AndSelector(selectors);
|
||||
List<String> result = andSelector.selectList(htmlContent);
|
||||
assertEquals(0, result.size());
|
||||
}
|
||||
}
|
@ -0,0 +1,39 @@
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.runners.MockitoJUnitRunner;
|
||||
|
||||
import java.util.List;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class CssSelectorTest {
|
||||
|
||||
@Test
|
||||
public void testSelectElement() {
|
||||
CssSelector cssSelector = new CssSelector("div");
|
||||
String htmlContent = "<html><head><title>Dummy Page</title></head><body><div id=\"dummyDiv\">Hello World!</div></body></html>";
|
||||
Document doc = Jsoup.parse(htmlContent);
|
||||
Element dummyElement = doc.getElementById("dummyDiv");
|
||||
Element resultElement = cssSelector.selectElement(dummyElement);
|
||||
assertNotNull(resultElement);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSelectList() {
|
||||
CssSelector cssSelector = new CssSelector("div");
|
||||
String htmlContent = "<html><head><title>Dummy Page</title></head><body><div id=\"dummyDiv\">Hello World!</div></body></html>";
|
||||
Document doc = Jsoup.parse(htmlContent);
|
||||
Element dummyElement = doc.getElementById("dummyDiv");
|
||||
List<String> result = cssSelector.selectList(dummyElement);
|
||||
assertEquals(1, result.size());
|
||||
assertEquals("[<div id=\"dummyDiv\">\n Hello World!\n</div>]", result.toString());
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,44 @@
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class OrSelectorTest {
|
||||
@Test
|
||||
public void testSelectList() {
|
||||
String htmlContent = "<!DOCTYPE html>\n" +
|
||||
"<html lang=\"en\">\n" +
|
||||
"<head>\n" +
|
||||
" <meta charset=\"UTF-8\">\n" +
|
||||
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||
" <title>HTML with XPath</title>\n" +
|
||||
"</head>\n" +
|
||||
"<body>\n" +
|
||||
" <div class=\"container\">\n" +
|
||||
" <div class=\"item1\">Item 1</div>\n" +
|
||||
" <div class=\"item2\">Item 2</div>\n" +
|
||||
" </div>\n" +
|
||||
"</body>\n" +
|
||||
"</html>";
|
||||
String expectedResult = "[<head>\n" +
|
||||
" <meta charset=\"UTF-8\">\n" +
|
||||
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
|
||||
" <title>HTML with XPath</title>\n" +
|
||||
"</head>, <div class=\"item1\">\n" +
|
||||
" Item 1\n" +
|
||||
"</div>, <div class=\"item2\">\n" +
|
||||
" Item 2\n" +
|
||||
"</div>]";
|
||||
List<Selector> selectors = new ArrayList<Selector>();
|
||||
selectors.add(new CssSelector("head"));
|
||||
selectors.add(new XpathSelector("//div[@class='item1']"));
|
||||
selectors.add(new XpathSelector("//div[@class='item2']"));
|
||||
OrSelector orSelector = new OrSelector(selectors);
|
||||
List<String> result = orSelector.selectList(htmlContent);
|
||||
assertEquals(expectedResult, result.toString());
|
||||
}
|
||||
}
|
@ -0,0 +1,16 @@
|
||||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class CharsetUtilsTest {
|
||||
|
||||
@Test
|
||||
void testDetectCharset() throws IOException {
|
||||
assertNull(CharsetUtils.detectCharset(null, new byte[0]));
|
||||
}
|
||||
|
||||
}
|
@ -1,21 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="stdout" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Root level="info">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@ -1,58 +1,33 @@
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
import us.codecraft.webmagic.model.sources.Source;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Wrapper of field and extractor.
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.2.0
|
||||
*/
|
||||
class FieldExtractor extends Extractor {
|
||||
public class FieldExtractor extends Extractor {
|
||||
|
||||
@Getter
|
||||
private final Field field;
|
||||
|
||||
@Getter @Setter
|
||||
private Method setterMethod;
|
||||
|
||||
@Getter @Setter
|
||||
private ObjectFormatter objectFormatter;
|
||||
|
||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
|
||||
super(selector, source, notNull, multi);
|
||||
this.field = field;
|
||||
}
|
||||
|
||||
Field getField() {
|
||||
return field;
|
||||
}
|
||||
|
||||
Selector getSelector() {
|
||||
return selector;
|
||||
}
|
||||
|
||||
Source getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
void setSetterMethod(Method setterMethod) {
|
||||
this.setterMethod = setterMethod;
|
||||
}
|
||||
|
||||
Method getSetterMethod() {
|
||||
return setterMethod;
|
||||
}
|
||||
|
||||
boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
|
||||
ObjectFormatter getObjectFormatter() {
|
||||
return objectFormatter;
|
||||
}
|
||||
|
||||
void setObjectFormatter(ObjectFormatter objectFormatter) {
|
||||
this.objectFormatter = objectFormatter;
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,42 @@
|
||||
package us.codecraft.webmagic.model.fields;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import lombok.Getter;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
|
||||
public class MultipleField extends PageField {
|
||||
@Getter
|
||||
private List<String> fieldNames;
|
||||
|
||||
public MultipleField(List<String> fieldNames) {
|
||||
this.fieldNames = fieldNames;
|
||||
}
|
||||
|
||||
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
|
||||
if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull())
|
||||
return false;
|
||||
if (fieldExtractor.getObjectFormatter() != null) {
|
||||
List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
|
||||
setField(o, fieldExtractor, converted);
|
||||
}
|
||||
else
|
||||
setField(o, fieldExtractor, this.fieldNames);
|
||||
return true;
|
||||
}
|
||||
|
||||
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
|
||||
List<Object> objects = new ArrayList<>();
|
||||
for (String value : values) {
|
||||
Object converted = this.convert(value, objectFormatter, logger);
|
||||
if (converted != null)
|
||||
objects.add(converted);
|
||||
}
|
||||
return objects;
|
||||
}
|
||||
}
|
@ -0,0 +1,31 @@
|
||||
package us.codecraft.webmagic.model.fields;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
|
||||
|
||||
public abstract class PageField {
|
||||
public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException;
|
||||
|
||||
protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
|
||||
try {
|
||||
Object format = objectFormatter.format(value);
|
||||
logger.debug("String {} is converted to {}", value, format);
|
||||
return format;
|
||||
} catch (Exception e) {
|
||||
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||
if (value != null) {
|
||||
if (fieldExtractor.getSetterMethod() != null)
|
||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||
fieldExtractor.getField().set(o, value);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
package us.codecraft.webmagic.model.fields;
|
||||
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import lombok.Getter;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
|
||||
public class SingleField extends PageField {
|
||||
@Getter
|
||||
private String fieldName;
|
||||
|
||||
public SingleField(String fieldName) {
|
||||
this.fieldName = fieldName;
|
||||
}
|
||||
|
||||
public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
|
||||
if (fieldExtractor.getObjectFormatter() != null) {
|
||||
Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
|
||||
if (converted == null && fieldExtractor.isNotNull())
|
||||
return false;
|
||||
setField(o, fieldExtractor, converted);
|
||||
} else
|
||||
setField(o, fieldExtractor, this.fieldName);
|
||||
return true;
|
||||
}
|
||||
}
|
@ -0,0 +1,85 @@
|
||||
package us.codecraft.webmagic.model.formatter;
|
||||
|
||||
/**
 * Maps a primitive type or its wrapper to the wrapper class.
 */
public interface BasicClassDetector {
    /**
     * @param type the class to inspect
     * @return the wrapper class this detector is responsible for when
     *         {@code type} is that wrapper or its primitive; otherwise {@code null}
     */
    Class<?> detectBasicClass(Class<?> type);
}

/** Recognises {@code int} and {@link Integer}. */
class IntegerClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(Integer.TYPE) || type.equals(Integer.class);
        return matches ? Integer.class : null;
    }
}

/** Recognises {@code long} and {@link Long}. */
class LongClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(Long.TYPE) || type.equals(Long.class);
        return matches ? Long.class : null;
    }
}

/** Recognises {@code double} and {@link Double}. */
class DoubleClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(Double.TYPE) || type.equals(Double.class);
        return matches ? Double.class : null;
    }
}

/** Recognises {@code float} and {@link Float}. */
class FloatClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(Float.TYPE) || type.equals(Float.class);
        return matches ? Float.class : null;
    }
}

/** Recognises {@code short} and {@link Short}. */
class ShortClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(Short.TYPE) || type.equals(Short.class);
        return matches ? Short.class : null;
    }
}

/** Recognises {@code char} and {@link Character}. */
class CharacterClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(Character.TYPE) || type.equals(Character.class);
        return matches ? Character.class : null;
    }
}

/** Recognises {@code byte} and {@link Byte}. */
class ByteClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(Byte.TYPE) || type.equals(Byte.class);
        return matches ? Byte.class : null;
    }
}

/** Recognises {@code boolean} and {@link Boolean}. */
class BooleanClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean matches = type.equals(Boolean.TYPE) || type.equals(Boolean.class);
        return matches ? Boolean.class : null;
    }
}
|
@ -0,0 +1,68 @@
|
||||
package us.codecraft.webmagic.model.sources;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
|
||||
public interface Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
|
||||
|
||||
public class RawHtml implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
}
|
||||
}
|
||||
|
||||
public class SelectedHtml implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
if (isRaw)
|
||||
return page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||
else
|
||||
return fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
if (isRaw)
|
||||
return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||
else
|
||||
return fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
}
|
||||
|
||||
public class Url implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
}
|
||||
}
|
||||
|
||||
public class RawText implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(page.getRawText());
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(page.getRawText());
|
||||
}
|
||||
}
|
||||
|
||||
public class DefaultSource implements Source {
|
||||
public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
|
||||
public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
return fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,17 @@
|
||||
package us.codecraft.webmagic.model.sources;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.FieldExtractor;
|
||||
import us.codecraft.webmagic.model.fields.MultipleField;
|
||||
import us.codecraft.webmagic.model.fields.PageField;
|
||||
import us.codecraft.webmagic.model.fields.SingleField;
|
||||
|
||||
public class SourceTextExtractor {
|
||||
public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
|
||||
Source source = fieldExtractor.getSource();
|
||||
if (fieldExtractor.isMulti())
|
||||
return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor));
|
||||
else
|
||||
return new SingleField(source.getText(page, html, isRaw, fieldExtractor));
|
||||
}
|
||||
}
|
@ -1,21 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
@ -1,21 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="stdout" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Root level="info">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@ -1,26 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.springframework" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<logger name="net.sf.ehcache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
@ -0,0 +1,19 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="stdout" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.springframework" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Logger name="net.sf.ehcache" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Root level="info">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@ -0,0 +1,47 @@
|
||||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import us.codecraft.webmagic.scripts.languages.JRuby;
|
||||
import us.codecraft.webmagic.scripts.languages.Javascript;
|
||||
import us.codecraft.webmagic.scripts.languages.Language;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
public class Params {
|
||||
@Getter
|
||||
Language language = new Javascript();
|
||||
|
||||
@Getter @Setter
|
||||
String scriptFileName;
|
||||
|
||||
@Getter @Setter
|
||||
List<String> urls;
|
||||
|
||||
@Getter @Setter
|
||||
int thread = 1;
|
||||
|
||||
@Getter @Setter
|
||||
int sleepTime = 1000;
|
||||
|
||||
private static Map<Language, Set<String>> alias;
|
||||
|
||||
public Params() {
|
||||
alias = new HashMap<Language, Set<String>>();
|
||||
alias.put(new Javascript(), WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
|
||||
alias.put(new JRuby(), WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
|
||||
}
|
||||
|
||||
public void setLanguagefromArg(String arg) {
|
||||
for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
|
||||
if (languageSetEntry.getValue().contains(arg)) {
|
||||
this.language = languageSetEntry.getKey();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,82 @@
|
||||
package us.codecraft.webmagic.scripts.config;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.cli.CommandLine;
|
||||
|
||||
import lombok.Getter;
|
||||
import us.codecraft.webmagic.scripts.Params;
|
||||
|
||||
public abstract class CommandLineOption {
|
||||
@Getter
|
||||
char option;
|
||||
|
||||
public CommandLineOption(char option) {
|
||||
this.option = option;
|
||||
}
|
||||
|
||||
protected abstract void addParamOption(Params params, CommandLine commandLine);
|
||||
|
||||
public void addParamOptionIfInCommandLine(Params params, CommandLine commandLine) {
|
||||
if (commandLine.hasOption(this.option))
|
||||
this.addParamOption(params, commandLine);
|
||||
}
|
||||
|
||||
public static List<CommandLineOption> getAllOptions() {
|
||||
return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG());
|
||||
}
|
||||
}
|
||||
|
||||
class OptionL extends CommandLineOption {
|
||||
public OptionL() {
|
||||
super('l');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
String language = commandLine.getOptionValue("l");
|
||||
params.setLanguagefromArg(language);
|
||||
}
|
||||
}
|
||||
|
||||
class OptionF extends CommandLineOption {
|
||||
public OptionF() {
|
||||
super('f');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
String scriptFilename = commandLine.getOptionValue("f");
|
||||
params.setScriptFileName(scriptFilename);
|
||||
}
|
||||
}
|
||||
|
||||
class OptionS extends CommandLineOption {
|
||||
public OptionS() {
|
||||
super('s');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
|
||||
params.setSleepTime(sleepTime);
|
||||
}
|
||||
}
|
||||
|
||||
class OptionT extends CommandLineOption {
|
||||
public OptionT() {
|
||||
super('t');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
|
||||
params.setThread(thread);
|
||||
}
|
||||
}
|
||||
|
||||
class OptionG extends CommandLineOption {
|
||||
public OptionG() {
|
||||
super('g');
|
||||
}
|
||||
|
||||
protected void addParamOption(Params params, CommandLine commandLine) {
|
||||
ConfigLogger.configLogger(commandLine.getOptionValue("g"));
|
||||
}
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
package us.codecraft.webmagic.scripts.config;
|
||||
|
||||
import java.util.List;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.Logger;
import org.apache.logging.log4j.core.config.Configurator;
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class ConfigLogger {
|
||||
/**
|
||||
* Log the config parameter. If the counter is less than the number of available
|
||||
* options then it means that the user entered an option
|
||||
*
|
||||
* @param value The config string
|
||||
*/
|
||||
public static void configLogger(String value) {
|
||||
List<Pair<String, Level>> options = List.of(
|
||||
Pair.of("debug", Level.DEBUG),
|
||||
Pair.of("info", Level.INFO),
|
||||
Pair.of("warn", Level.WARN),
|
||||
Pair.of("trace", Level.TRACE),
|
||||
Pair.of("off", Level.OFF),
|
||||
Pair.of("error", Level.ERROR));
|
||||
Pair<String, Level> option = options.get(0);
|
||||
int i = 1;
|
||||
while (i < options.size() && !option.getLeft().equalsIgnoreCase(value))
|
||||
option = options.get(i++);
|
||||
if (i < options.size()) {
|
||||
Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
|
||||
rootLogger.setLevel(option.getRight());
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
package us.codecraft.webmagic.scripts.languages;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
|
||||
import org.jruby.RubyHash;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
||||
public class JRuby extends Language {
|
||||
public JRuby() {
|
||||
super("jruby","ruby/defines.rb","");
|
||||
}
|
||||
|
||||
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||
RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext());
|
||||
Iterator itruby = oRuby.entrySet().iterator();
|
||||
while (itruby.hasNext()) {
|
||||
Map.Entry pairs = (Map.Entry) itruby.next();
|
||||
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,16 @@
|
||||
package us.codecraft.webmagic.scripts.languages;
|
||||
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
||||
public class Javascript extends Language {
|
||||
public Javascript() {
|
||||
super("javascript","js/defines.js","");
|
||||
}
|
||||
|
||||
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||
engine.eval(defines + "\n" + script, engine.getContext());
|
||||
}
|
||||
}
|
@ -0,0 +1,27 @@
|
||||
package us.codecraft.webmagic.scripts.languages;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptException;
|
||||
|
||||
import org.python.core.PyDictionary;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
||||
public class Jython extends Language {
|
||||
public Jython() {
|
||||
super("jython","python/defines.py","");
|
||||
}
|
||||
|
||||
public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
|
||||
engine.eval(defines + "\n" + script, engine.getContext());
|
||||
PyDictionary oJython = (PyDictionary) engine.get("result");
|
||||
Iterator it = oJython.entrySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Map.Entry pairs = (Map.Entry) it.next();
|
||||
page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
|
||||
}
|
||||
}
|
||||
}
|
@ -1,21 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="error" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="info" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
@ -1,21 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<logger name="org.apache" additivity="false">
|
||||
<level value="warn" />
|
||||
<appender-ref ref="stdout" />
|
||||
</logger>
|
||||
|
||||
<root>
|
||||
<level value="debug" />
|
||||
<appender-ref ref="stdout" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="stdout" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache" level="warn" additivity="false">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Logger>
|
||||
<Root level="debug">
|
||||
<AppenderRef ref="stdout" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@ -1,45 +1,46 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.10.3</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<project
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="
|
||||
http://maven.apache.org/POM/4.0.0
|
||||
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>webmagic-selenium</artifactId>
|
||||
<artifactId>webmagic-selenium</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.seleniumhq.selenium</groupId>
|
||||
<artifactId>selenium-java</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.detro</groupId>
|
||||
<artifactId>phantomjsdriver</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.seleniumhq.selenium</groupId>
|
||||
<artifactId>selenium-java</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.detro</groupId>
|
||||
<artifactId>phantomjsdriver</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-deploy-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
<configuration>
|
||||
<skip>true</skip>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-deploy-plugin</artifactId>
|
||||
<version>3.0.0-M1</version>
|
||||
<configuration>
|
||||
<skip>true</skip>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
|
Loading…
Reference in New Issue