update scripts
parent
59f67b1e37
commit
bc5c30de17
@ -0,0 +1,82 @@
|
||||
package us.codecraft.webmagic.javascript;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scripts.ScriptProcessor;
|
||||
import us.codecraft.webmagic.scripts.ScriptProcessorBuilder;
|
||||
|
||||
import javax.script.ScriptContext;
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptEngineManager;
|
||||
import javax.script.ScriptException;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.4.1
|
||||
*/
|
||||
public class JsScriptProcessor implements PageProcessor {
|
||||
|
||||
private ScriptEngine engine;
|
||||
|
||||
private String defines;
|
||||
|
||||
private String script;
|
||||
|
||||
JsScriptProcessor(String script) throws IOException {
|
||||
ScriptEngineManager manager = new ScriptEngineManager();
|
||||
engine = manager.getEngineByName("javascript");
|
||||
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("js/defines.js");
|
||||
defines = IOUtils.toString(resourceAsStream);
|
||||
this.script = script;
|
||||
}
|
||||
|
||||
public static JsScriptProcessor fromFile(String fileName) {
|
||||
try {
|
||||
InputStream resourceAsStream = new FileInputStream(fileName);
|
||||
String script = IOUtils.toString(resourceAsStream);
|
||||
return new JsScriptProcessor(script);
|
||||
} catch (IOException e) {
|
||||
//wrap IOException because I prefer a runtime exception...
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static JsScriptProcessor fromClassPathFile(String fileName) {
|
||||
try {
|
||||
InputStream resourceAsStream = JsScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
|
||||
String script = IOUtils.toString(resourceAsStream);
|
||||
return new JsScriptProcessor(script);
|
||||
} catch (IOException e) {
|
||||
//wrap IOException because I prefer a runtime exception...
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
ScriptContext context = engine.getContext();
|
||||
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
||||
try {
|
||||
engine.eval(defines + script, context);
|
||||
} catch (ScriptException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().scriptFromClassPathFile("js/oschina.js").build();
|
||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").run();
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
/**
 * Scripting languages selectable for a script-driven processor.
 *
 * <p>Each constant carries three strings: a script engine name, the path of a
 * "define" file and the path of a "gather" file. The gather file is the empty
 * string for every constant currently declared.
 *
 * <p>NOTE(review): presumably the engine name is what gets passed to
 * {@code ScriptEngineManager.getEngineByName} and the define file is a classpath
 * resource prepended to user scripts - confirm against {@code ScriptProcessor}.
 *
 * @author code4crafter@gmail.com
 */
public enum Language {

    JavaScript("javascript", "js/defines.js", ""),

    JRuby("jruby", "ruby/defines.rb", "");

    // Enum constants are shared singletons, so their state is kept immutable.
    private final String engineName;

    private final String defineFile;

    private final String gatherFile;

    Language(String engineName, String defineFile, String gatherFile) {
        this.engineName = engineName;
        this.defineFile = defineFile;
        this.gatherFile = gatherFile;
    }

    /**
     * @return the script engine name associated with this language, e.g. {@code "javascript"}
     */
    public String getEngineName() {
        return engineName;
    }

    /**
     * @return the path of this language's define file, e.g. {@code "js/defines.js"}
     */
    public String getDefineFile() {
        return defineFile;
    }

    /**
     * @return the path of this language's gather file; empty for all current constants
     */
    public String getGatherFile() {
        return gatherFile;
    }
}
|
@ -0,0 +1,64 @@
|
||||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.4.1
|
||||
*/
|
||||
public class ScriptProcessorBuilder {
|
||||
|
||||
private static final Language DefaultLanguage = Language.JavaScript;
|
||||
|
||||
private Language language = DefaultLanguage;
|
||||
|
||||
private String script;
|
||||
|
||||
private ScriptProcessorBuilder() {
|
||||
}
|
||||
|
||||
public static ScriptProcessorBuilder custom() {
|
||||
return new ScriptProcessorBuilder();
|
||||
}
|
||||
|
||||
public ScriptProcessorBuilder language(Language language) {
|
||||
this.language = language;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ScriptProcessorBuilder scriptFromFile(String fileName) {
|
||||
try {
|
||||
InputStream resourceAsStream = new FileInputStream(fileName);
|
||||
this.script = IOUtils.toString(resourceAsStream);
|
||||
} catch (IOException e) {
|
||||
//wrap IOException because I prefer a runtime exception...
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) {
|
||||
try {
|
||||
InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
|
||||
this.script = IOUtils.toString(resourceAsStream);
|
||||
} catch (IOException e) {
|
||||
//wrap IOException because I prefer a runtime exception...
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public ScriptProcessorBuilder script(String script) {
|
||||
this.script = script;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ScriptProcessor build(){
|
||||
return new ScriptProcessor(language,script);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.4.1
|
||||
*/
|
||||
public class ScriptProcessorTest {
|
||||
|
||||
@Test
|
||||
public void testJavaScriptProcessor() {
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build();
|
||||
pageProcessor.getSite().setSleepTime(0);
|
||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRubyProcessor() {
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build();
|
||||
pageProcessor.getSite().setSleepTime(0);
|
||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||
}
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!--
  log4j 1.x configuration: one console appender, the "org.apache" logger
  limited to WARN, and a DEBUG-level root logger.
-->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">

    <!--
      Console output. Pattern fields, in order: timestamp, level left-justified
      to 5 characters, logger name, (source file:line), "##", message, newline.
    -->
    <appender name="stdout" class="org.apache.log4j.ConsoleAppender">
        <layout class="org.apache.log4j.PatternLayout">
            <param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
        </layout>
    </appender>

    <!--
      Loggers under "org.apache" only emit WARN and above. additivity="false"
      stops these events from also reaching the root logger's appender, which
      would print them twice since both reference "stdout".
    -->
    <logger name="org.apache" additivity="false">
        <level value="warn" />
        <appender-ref ref="stdout" />
    </logger>

    <!-- Every other logger: DEBUG and above, written to the console. -->
    <root>
        <level value="debug" />
        <appender-ref ref="stdout" />
    </root>

</log4j:configuration>
|
Loading…
Reference in New Issue