update scripts

pull/88/head^2
yihua.huang 11 years ago
parent 59f67b1e37
commit bc5c30de17

@ -48,6 +48,7 @@
<modules> <modules>
<module>webmagic-core</module> <module>webmagic-core</module>
<module>webmagic-extension/</module> <module>webmagic-extension/</module>
<module>webmagic-scripts/</module>
</modules> </modules>
<dependencyManagement> <dependencyManagement>

@ -8,21 +8,11 @@
</layout> </layout>
</appender> </appender>
<logger name="org.springframework" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="org.apache" additivity="false"> <logger name="org.apache" additivity="false">
<level value="warn" /> <level value="warn" />
<appender-ref ref="stdout" /> <appender-ref ref="stdout" />
</logger> </logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root> <root>
<level value="info" /> <level value="info" />
<appender-ref ref="stdout" /> <appender-ref ref="stdout" />

@ -0,0 +1,82 @@
package us.codecraft.webmagic.javascript;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scripts.ScriptProcessor;
import us.codecraft.webmagic.scripts.ScriptProcessorBuilder;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * {@link PageProcessor} that hands every page to a user supplied JavaScript snippet.
 * <p>
 * The snippet is evaluated through the {@code javax.script} engine registered under the
 * name {@code "javascript"}. On each {@link #process(Page)} call the text of the classpath
 * resource {@code js/defines.js} is prepended to the snippet and the page is exposed to the
 * script as the variable {@code page}.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class JsScriptProcessor implements PageProcessor {

    /** Classpath resource whose content is evaluated in front of every user script. */
    private static final String DEFINES_RESOURCE = "js/defines.js";

    private ScriptEngine engine;

    private String defines;

    private String script;

    // One Site per processor, the same way ScriptProcessor keeps it, so that settings applied
    // through getSite() are not lost.
    // NOTE(review): presumably Site.me() hands out a new Site on each call - confirm.
    private Site site = Site.me();

    /**
     * Creates a processor for the given script text.
     *
     * @param script JavaScript source evaluated for every page
     * @throws IOException           if {@code js/defines.js} is missing from the classpath or cannot be read
     * @throws IllegalStateException if no script engine is registered under the name "javascript"
     */
    JsScriptProcessor(String script) throws IOException {
        ScriptEngineManager manager = new ScriptEngineManager();
        engine = manager.getEngineByName("javascript");
        if (engine == null) {
            // getEngineByName signals "not found" with null; fail here instead of with an NPE in process()
            throw new IllegalStateException("no script engine registered under the name \"javascript\"");
        }
        InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(DEFINES_RESOURCE);
        defines = readAndClose(resourceAsStream, DEFINES_RESOURCE);
        this.script = script;
    }

    /**
     * Builds a processor from a script stored in the file system.
     *
     * @param fileName path of the script file
     * @return processor running the content of that file
     * @throws IllegalArgumentException if the file (or {@code js/defines.js}) cannot be read
     */
    public static JsScriptProcessor fromFile(String fileName) {
        try {
            InputStream resourceAsStream = new FileInputStream(fileName);
            String script = readAndClose(resourceAsStream, fileName);
            return new JsScriptProcessor(script);
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Builds a processor from a script stored on the classpath.
     *
     * @param fileName classpath resource name of the script
     * @return processor running the content of that resource
     * @throws IllegalArgumentException if the resource (or {@code js/defines.js}) is missing or cannot be read
     */
    public static JsScriptProcessor fromClassPathFile(String fileName) {
        try {
            InputStream resourceAsStream = JsScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
            String script = readAndClose(resourceAsStream, fileName);
            return new JsScriptProcessor(script);
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Reads the whole stream into a string and always closes it afterwards.
     *
     * @param in   stream to drain; {@code null} means the resource was not found
     * @param name resource or file name, used only for the error message
     * @return the stream content
     * @throws IOException if {@code in} is {@code null} or reading fails
     */
    private static String readAndClose(InputStream in, String name) throws IOException {
        if (in == null) {
            throw new IOException("script resource not found: " + name);
        }
        try {
            return IOUtils.toString(in);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Evaluates the defines followed by the user script with {@code page} bound in engine scope.
     * A {@link ScriptException} is printed to standard error and not rethrown.
     *
     * @param page page handed to the script
     */
    @Override
    public void process(Page page) {
        ScriptContext context = engine.getContext();
        context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
        try {
            engine.eval(defines + script, context);
        } catch (ScriptException e) {
            e.printStackTrace();
        }
    }

    /**
     * @return the site settings held by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Manual demo: runs the classpath script {@code js/oschina.js} against a sample blog URL.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().scriptFromClassPathFile("js/oschina.js").build();
        Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").run();
    }
}

@ -1,9 +1,10 @@
package us.codecraft.webmagic.processor; package us.codecraft.webmagic.jruby;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import javax.script.ScriptContext; import javax.script.ScriptContext;
import javax.script.ScriptEngine; import javax.script.ScriptEngine;
@ -15,7 +16,7 @@ import java.io.InputStream;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
*/ */
public class RubyScriptProcessor implements PageProcessor{ public class RubyScriptProcessor implements PageProcessor {
private ScriptEngine rubyEngine; private ScriptEngine rubyEngine;

@ -0,0 +1,35 @@
package us.codecraft.webmagic.scripts;
/**
 * Scripting languages a script processor can be configured with.
 * <p>
 * Each constant carries the {@code javax.script} engine name to look up and the classpath
 * resource holding the language specific definitions that are loaded alongside a user script.
 *
 * @author code4crafter@gmail.com
 */
public enum Language {

    JavaScript("javascript", "js/defines.js", ""),

    JRuby("jruby", "ruby/defines.rb", "");

    // Per-constant data never changes after construction, so the fields are final.
    private final String engineName;

    private final String defineFile;

    private final String gatherFile;

    /**
     * @param engineName script engine name used for the engine lookup
     * @param defineFile classpath resource with the language's definitions
     * @param gatherFile currently the empty string for every constant
     */
    Language(String engineName, String defineFile, String gatherFile) {
        this.engineName = engineName;
        this.defineFile = defineFile;
        this.gatherFile = gatherFile;
    }

    /**
     * @return the script engine name, e.g. {@code "javascript"}
     */
    public String getEngineName() {
        return engineName;
    }

    /**
     * @return the classpath resource name of the definitions file
     */
    public String getDefineFile() {
        return defineFile;
    }

    /**
     * @return the gather file name; empty for all constants declared here
     */
    public String getGatherFile() {
        return gatherFile;
    }
}

@ -1,9 +1,9 @@
package us.codecraft.webmagic.processor; package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor;
import javax.script.ScriptContext; import javax.script.ScriptContext;
import javax.script.ScriptEngine; import javax.script.ScriptEngine;
@ -14,48 +14,51 @@ import java.io.InputStream;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @since 0.4.1
*/ */
public class JsScriptProcessor implements PageProcessor{ public class ScriptProcessor implements PageProcessor {
private ScriptEngine rubyEngine; private ScriptEngine engine;
private String defines; private String defines;
private String script; private String script;
public JsScriptProcessor(String filename){ private final Language language;
private Site site = Site.me();
public ScriptProcessor(Language language, String script) {
if (language == null || script == null) {
throw new IllegalArgumentException("language and script must not be null!");
}
this.language = language;
ScriptEngineManager manager = new ScriptEngineManager(); ScriptEngineManager manager = new ScriptEngineManager();
rubyEngine = manager.getEngineByName("javascript"); engine = manager.getEngineByName(language.getEngineName());
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("js/defines.js"); InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile());
try { try {
defines = IOUtils.toString(resourceAsStream); defines = IOUtils.toString(resourceAsStream);
resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(filename);
script = IOUtils.toString(resourceAsStream);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); throw new IllegalArgumentException(e);
} }
this.script = script;
} }
@Override @Override
public void process(Page page) { public void process(Page page) {
ScriptContext context = rubyEngine.getContext(); ScriptContext context = engine.getContext();
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
try { try {
rubyEngine.eval(defines+script, context); engine.eval(defines + script, context);
} catch (ScriptException e) { } catch (ScriptException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
@Override @Override
public Site getSite() { public Site getSite() {
return Site.me(); return site;
} }
public static void main(String[] args) {
Spider.create(new JsScriptProcessor("js/oschina.js")).addUrl("http://my.oschina.net/flashsword/blog").run();
}
} }

@ -0,0 +1,64 @@
package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Fluent builder for {@link ScriptProcessor}.
 * <p>
 * The language defaults to {@link Language#JavaScript}. The script text can be given directly
 * or loaded from a file or a classpath resource; the last call wins.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptProcessorBuilder {

    private static final Language DEFAULT_LANGUAGE = Language.JavaScript;

    private Language language = DEFAULT_LANGUAGE;

    private String script;

    private ScriptProcessorBuilder() {
    }

    /**
     * @return a new builder preset to the default language and no script
     */
    public static ScriptProcessorBuilder custom() {
        return new ScriptProcessorBuilder();
    }

    /**
     * Selects the scripting language.
     *
     * @param language language of the script
     * @return this builder
     */
    public ScriptProcessorBuilder language(Language language) {
        this.language = language;
        return this;
    }

    /**
     * Loads the script text from a file.
     *
     * @param fileName path of the script file
     * @return this builder
     * @throws IllegalArgumentException if the file cannot be opened or read
     */
    public ScriptProcessorBuilder scriptFromFile(String fileName) {
        try {
            InputStream resourceAsStream = new FileInputStream(fileName);
            this.script = readAndClose(resourceAsStream);
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
        return this;
    }

    /**
     * Loads the script text from a classpath resource.
     *
     * @param fileName classpath resource name of the script
     * @return this builder
     * @throws IllegalArgumentException if the resource does not exist or cannot be read
     */
    public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) {
        InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
        if (resourceAsStream == null) {
            // getResourceAsStream reports a missing resource with null; name it instead of failing with an NPE
            throw new IllegalArgumentException("classpath resource not found: " + fileName);
        }
        try {
            this.script = readAndClose(resourceAsStream);
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
        return this;
    }

    /**
     * Sets the script text directly.
     *
     * @param script script source
     * @return this builder
     */
    public ScriptProcessorBuilder script(String script) {
        this.script = script;
        return this;
    }

    /**
     * Creates the processor from the current language and script.
     *
     * @return a new {@link ScriptProcessor}
     */
    public ScriptProcessor build() {
        return new ScriptProcessor(language, script);
    }

    /**
     * Reads the whole stream into a string and always closes it afterwards.
     *
     * @param in open stream, not {@code null}
     * @return the stream content
     * @throws IOException if reading fails
     */
    private static String readAndClose(InputStream in) throws IOException {
        try {
            return IOUtils.toString(in);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }
}

@ -0,0 +1,25 @@
package us.codecraft.webmagic.scripts;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
/**
 * Smoke tests that build a {@link ScriptProcessor} per supported language and run one
 * spider over a sample blog URL. Nothing is asserted; a test passes when the run returns
 * without throwing.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptProcessorTest {

    @Test
    public void testJavaScriptProcessor() {
        runOnce(Language.JavaScript, "js/oschina.js");
    }

    @Test
    public void testRubyProcessor() {
        runOnce(Language.JRuby, "ruby/oschina.rb");
    }

    /**
     * Builds a processor for the given language from a classpath script, removes the
     * sleep time and runs a spider on the blog URL with URL spawning switched off.
     *
     * @param language        script language to use
     * @param classPathScript classpath resource name of the script
     */
    private static void runOnce(Language language, String classPathScript) {
        ScriptProcessorBuilder builder = ScriptProcessorBuilder.custom();
        builder.language(language);
        builder.scriptFromClassPathFile(classPathScript);
        ScriptProcessor pageProcessor = builder.build();
        pageProcessor.getSite().setSleepTime(0);
        Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
    }
}

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- log4j configuration: everything goes to one console appender named "stdout". -->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<!-- Console appender; each line is: timestamp, level, logger name (source file:line), "##", message. -->
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<!-- Loggers under org.apache are limited to warn and above; additivity is off, so they write only to the appender referenced here. -->
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<!-- All other loggers inherit the root level: debug. -->
<root>
<level value="debug" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>
Loading…
Cancel
Save