update scripts

pull/88/head^2
yihua.huang 11 years ago
parent 59f67b1e37
commit bc5c30de17

@ -48,6 +48,7 @@
<modules>
<module>webmagic-core</module>
<module>webmagic-extension/</module>
<module>webmagic-scripts/</module>
</modules>
<dependencyManagement>

@ -8,21 +8,11 @@
</layout>
</appender>
<logger name="org.springframework" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />

@ -0,0 +1,82 @@
package us.codecraft.webmagic.javascript;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scripts.ScriptProcessor;
import us.codecraft.webmagic.scripts.ScriptProcessorBuilder;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * {@link PageProcessor} that hands every crawled page to a JavaScript snippet evaluated
 * through the {@code javax.script} API.
 * <p>
 * The text of the classpath resource {@code js/defines.js} is prepended to the user script
 * on every evaluation, and the current {@link Page} is exposed to the script under the
 * name {@code page}.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class JsScriptProcessor implements PageProcessor {

    /** Classpath location of the prelude that is evaluated in front of every user script. */
    private static final String DEFINES_RESOURCE = "js/defines.js";

    private final ScriptEngine engine;
    private final String defines;
    private final String script;

    /*
     * A single Site instance is kept so that settings applied through getSite() are not
     * lost on the next call; this mirrors the sibling ScriptProcessor in the scripts package.
     */
    private final Site site = Site.me();

    /**
     * Creates a processor for the given script source.
     *
     * @param script JavaScript source to evaluate for each page
     * @throws IOException           if {@code js/defines.js} is missing from the classpath or cannot be read
     * @throws IllegalStateException if no script engine is registered under the name {@code javascript}
     */
    JsScriptProcessor(String script) throws IOException {
        ScriptEngineManager manager = new ScriptEngineManager();
        engine = manager.getEngineByName("javascript");
        if (engine == null) {
            // getEngineByName returns null when no matching engine exists; fail here rather
            // than with a NullPointerException on the first processed page.
            throw new IllegalStateException("no script engine available for name \"javascript\"");
        }
        InputStream definesStream = this.getClass().getClassLoader().getResourceAsStream(DEFINES_RESOURCE);
        defines = readAndClose(definesStream, "classpath resource " + DEFINES_RESOURCE);
        this.script = script;
    }

    /**
     * Builds a processor from a script stored on the file system.
     *
     * @param fileName path of the script file
     * @return a processor evaluating the file's content
     * @throws IllegalArgumentException if the file (or the defines resource) cannot be read
     */
    public static JsScriptProcessor fromFile(String fileName) {
        try {
            String script = readAndClose(new FileInputStream(fileName), "file " + fileName);
            return new JsScriptProcessor(script);
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Builds a processor from a script stored on the classpath.
     *
     * @param fileName classpath-relative name of the script resource
     * @return a processor evaluating the resource's content
     * @throws IllegalArgumentException if the resource (or the defines resource) is missing or unreadable
     */
    public static JsScriptProcessor fromClassPathFile(String fileName) {
        try {
            InputStream scriptStream = JsScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
            String script = readAndClose(scriptStream, "classpath resource " + fileName);
            return new JsScriptProcessor(script);
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Reads the whole stream as text and always closes it afterwards.
     *
     * @param in          stream to drain; {@code null} means the resource was not found
     * @param description human-readable name of the source, used in the error message
     * @return the stream content
     * @throws IOException if {@code in} is {@code null} or reading fails
     */
    private static String readAndClose(InputStream in, String description) throws IOException {
        if (in == null) {
            throw new IOException(description + " not found");
        }
        try {
            return IOUtils.toString(in);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Evaluates the defines prelude followed by the user script, with {@code page} bound in
     * the engine scope. Script failures are printed and do not propagate to the caller.
     *
     * @param page page handed to the script
     */
    @Override
    public void process(Page page) {
        ScriptContext context = engine.getContext();
        context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
        try {
            engine.eval(defines + script, context);
        } catch (ScriptException e) {
            // NOTE(review): a failing script is only reported on stderr; consider routing
            // this through the project's logging once a logger is available here.
            e.printStackTrace();
        }
    }

    /**
     * @return the site configuration of this processor; the same instance on every call
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Demo entry point: runs the classpath script {@code js/oschina.js} against a sample blog URL.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().scriptFromClassPathFile("js/oschina.js").build();
        Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").run();
    }
}

@ -1,9 +1,10 @@
package us.codecraft.webmagic.processor;
package us.codecraft.webmagic.jruby;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
@ -15,7 +16,7 @@ import java.io.InputStream;
/**
* @author code4crafter@gmail.com
*/
public class RubyScriptProcessor implements PageProcessor{
public class RubyScriptProcessor implements PageProcessor {
private ScriptEngine rubyEngine;

@ -0,0 +1,35 @@
package us.codecraft.webmagic.scripts;
/**
 * Scripting languages supported by the script processor, together with the resources
 * each one needs.
 *
 * @author code4crafter@gmail.com
 */
public enum Language {

    JavaScript("javascript", "js/defines.js", ""),

    JRuby("jruby", "ruby/defines.rb", "");

    /** Name used to look the engine up (e.g. via {@code ScriptEngineManager#getEngineByName}). */
    private final String engineName;

    /** Classpath location of the language-specific defines file. */
    private final String defineFile;

    /** Gather file for the language; empty for every constant declared here. */
    private final String gatherFile;

    Language(String engineName, String defineFile, String gatherFile) {
        this.engineName = engineName;
        this.defineFile = defineFile;
        this.gatherFile = gatherFile;
    }

    /**
     * @return the script engine name of this language
     */
    public String getEngineName() {
        return engineName;
    }

    /**
     * @return the classpath location of the defines file of this language
     */
    public String getDefineFile() {
        return defineFile;
    }

    /**
     * @return the gather file of this language (currently an empty string)
     */
    public String getGatherFile() {
        return gatherFile;
    }
}

@ -1,9 +1,9 @@
package us.codecraft.webmagic.processor;
package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
@ -14,48 +14,51 @@ import java.io.InputStream;
/**
* @author code4crafter@gmail.com
* @since 0.4.1
*/
public class JsScriptProcessor implements PageProcessor{
public class ScriptProcessor implements PageProcessor {
private ScriptEngine rubyEngine;
private ScriptEngine engine;
private String defines;
private String script;
public JsScriptProcessor(String filename){
private final Language language;
private Site site = Site.me();
public ScriptProcessor(Language language, String script) {
if (language == null || script == null) {
throw new IllegalArgumentException("language and script must not be null!");
}
this.language = language;
ScriptEngineManager manager = new ScriptEngineManager();
rubyEngine = manager.getEngineByName("javascript");
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("js/defines.js");
engine = manager.getEngineByName(language.getEngineName());
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile());
try {
defines = IOUtils.toString(resourceAsStream);
resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(filename);
script = IOUtils.toString(resourceAsStream);
} catch (IOException e) {
e.printStackTrace();
throw new IllegalArgumentException(e);
}
this.script = script;
}
@Override
public void process(Page page) {
ScriptContext context = rubyEngine.getContext();
ScriptContext context = engine.getContext();
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
try {
rubyEngine.eval(defines+script, context);
engine.eval(defines + script, context);
} catch (ScriptException e) {
e.printStackTrace();
}
}
@Override
public Site getSite() {
return Site.me();
return site;
}
public static void main(String[] args) {
Spider.create(new JsScriptProcessor("js/oschina.js")).addUrl("http://my.oschina.net/flashsword/blog").run();
}
}

@ -0,0 +1,64 @@
package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Fluent builder for {@link ScriptProcessor}.
 * <p>
 * The language defaults to {@link Language#JavaScript}; the script text can be supplied
 * directly, from a file, or from a classpath resource. The last script source set wins.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptProcessorBuilder {

    private static final Language DEFAULT_LANGUAGE = Language.JavaScript;

    private Language language = DEFAULT_LANGUAGE;

    private String script;

    private ScriptProcessorBuilder() {
    }

    /**
     * @return a new builder preconfigured with the default language
     */
    public static ScriptProcessorBuilder custom() {
        return new ScriptProcessorBuilder();
    }

    /**
     * Selects the scripting language.
     *
     * @param language language of the script
     * @return this builder
     */
    public ScriptProcessorBuilder language(Language language) {
        this.language = language;
        return this;
    }

    /**
     * Loads the script text from a file on the file system.
     *
     * @param fileName path of the script file
     * @return this builder
     * @throws IllegalArgumentException if the file cannot be opened or read
     */
    public ScriptProcessorBuilder scriptFromFile(String fileName) {
        try {
            this.script = readAndClose(new FileInputStream(fileName));
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
        return this;
    }

    /**
     * Loads the script text from a classpath resource.
     *
     * @param fileName classpath-relative name of the script resource
     * @return this builder
     * @throws IllegalArgumentException if the resource does not exist or cannot be read
     */
    public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) {
        InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
        if (resourceAsStream == null) {
            // getResourceAsStream signals "not found" with null; report it explicitly
            // instead of letting the read fail with a NullPointerException.
            throw new IllegalArgumentException("classpath resource not found: " + fileName);
        }
        try {
            this.script = readAndClose(resourceAsStream);
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
        return this;
    }

    /**
     * Sets the script text directly.
     *
     * @param script script source
     * @return this builder
     */
    public ScriptProcessorBuilder script(String script) {
        this.script = script;
        return this;
    }

    /**
     * Creates the processor from the configured language and script.
     *
     * @return a new {@link ScriptProcessor}
     */
    public ScriptProcessor build() {
        return new ScriptProcessor(language, script);
    }

    /**
     * Reads the whole stream as text and always closes it afterwards.
     *
     * @param in non-null stream to drain
     * @return the stream content
     * @throws IOException if reading fails
     */
    private static String readAndClose(InputStream in) throws IOException {
        try {
            return IOUtils.toString(in);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }
}

@ -0,0 +1,25 @@
package us.codecraft.webmagic.scripts;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
/**
 * Runs a {@link ScriptProcessor} once per supported language against a sample blog URL.
 * The tests make no assertions; they only check that a crawl completes without throwing.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptProcessorTest {

    @Test
    public void testJavaScriptProcessor() {
        crawlOnce(Language.JavaScript, "js/oschina.js");
    }

    @Test
    public void testRubyProcessor() {
        crawlOnce(Language.JRuby, "ruby/oschina.rb");
    }

    /**
     * Builds a processor for the given language and classpath script, sets its sleep time
     * to zero, and runs a spider on the sample URL with URL spawning switched off.
     */
    private static void crawlOnce(Language scriptLanguage, String classPathScript) {
        ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
                .language(scriptLanguage)
                .scriptFromClassPathFile(classPathScript)
                .build();
        pageProcessor.getSite().setSleepTime(0);
        Spider.create(pageProcessor)
                .addUrl("http://my.oschina.net/flashsword/blog")
                .setSpawnUrl(false)
                .run();
    }
}

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<!-- log4j configuration: everything is written to a single console appender. -->
<!-- Console appender "stdout"; the pattern carries timestamp, level, category, file:line and message. -->
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<!-- org.apache.* loggers are limited to warn; additivity is off, so events are not also passed to the root logger's appenders. -->
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<!-- All other loggers log at debug level to stdout. -->
<root>
<level value="debug" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>
Loading…
Cancel
Save