diff --git a/asserts/logo-simple.jpg b/asserts/logo-simple.jpg
new file mode 100644
index 00000000..366aa627
Binary files /dev/null and b/asserts/logo-simple.jpg differ
diff --git a/pom.xml b/pom.xml
index 2aa3df7a..eba68c85 100644
--- a/pom.xml
+++ b/pom.xml
@@ -48,6 +48,7 @@
 webmagic-core
 webmagic-extension/
+ webmagic-scripts
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
index 9406f3ab..bb1b8688 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
@@ -45,6 +45,16 @@ public class PlainText implements Selectable {
         throw new UnsupportedOperationException();
     }
 
+    @Override
+    public Selectable css(String selector) {
+        return $(selector);
+    }
+
+    @Override
+    public Selectable css(String selector, String attrName) {
+        return $(selector, attrName);
+    }
+
     @Override
     public Selectable smartContent() {
         throw new UnsupportedOperationException();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
index 66df5d5b..6b4410e1 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
@@ -35,6 +35,23 @@ public interface Selectable {
      */
     public Selectable $(String selector, String attrName);
 
+    /**
+     * select list with css selector
+     *
+     * @param selector css selector expression
+     * @return new Selectable after extract
+     */
+    public Selectable css(String selector);
+
+    /**
+     * select list with css selector
+     *
+     * @param selector css selector expression
+     * @param attrName attribute name of css selector
+     * @return new Selectable after extract
+     */
+    public Selectable css(String selector, String attrName);
+
     /**
      * select smart content with ReadAbility algorithm
      *
diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
new file mode 100644
index 00000000..99aac59b
--- /dev/null
+++ b/webmagic-scripts/pom.xml
@@ -0,0 +1,35 @@
+
+
+
+ webmagic-parent
+ us.codecraft
+ 0.4.1-SNAPSHOT
+
+ 4.0.0
+
+ us.codecraft
+ webmagic-scripts
+ 0.4.1-SNAPSHOT
+
+
+
+ org.jruby
+ jruby
+ 1.7.6
+
+
+ junit
+ junit
+ test
+
+
+ us.codecraft
+ webmagic-core
+ ${project.version}
+
+
+
+
+
\ No newline at end of file
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java
new file mode 100644
index 00000000..b821ae48
--- /dev/null
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java
@@ -0,0 +1,115 @@
+package us.codecraft.webmagic.processor;
+
+import org.apache.commons.io.IOUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+
+import javax.script.ScriptContext;
+import javax.script.ScriptEngine;
+import javax.script.ScriptEngineManager;
+import javax.script.ScriptException;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * {@link PageProcessor} that delegates extraction to a JRuby script.
+ * <p>
+ * The helper definitions in {@code ruby/defines.rb} are prepended to the page
+ * script, and the combined source is evaluated once per page with the current
+ * {@link Page} bound to the engine-scope attribute {@code page}.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class ScriptProcessor implements PageProcessor {
+
+    /** Classpath location of the Ruby helper functions (xpath, css, urls). */
+    private static final String DEFINES_RESOURCE = "ruby/defines.rb";
+
+    /** Classpath location of the page script used by the no-arg constructor. */
+    private static final String DEFAULT_SCRIPT_RESOURCE = "ruby/oschina.rb";
+
+    /** Explicit charset so script text does not depend on the platform default. */
+    private static final String SCRIPT_ENCODING = "UTF-8";
+
+    private final ScriptEngine rubyEngine;
+
+    /** Helper definitions followed by the page script, loaded once at construction. */
+    private final String script;
+
+    /**
+     * Creates a processor that runs the bundled {@code ruby/oschina.rb} script.
+     *
+     * @throws IllegalStateException if the JRuby engine or a script resource is unavailable
+     */
+    ScriptProcessor() {
+        this(DEFAULT_SCRIPT_RESOURCE);
+    }
+
+    /**
+     * Creates a processor that runs the given classpath script for every page.
+     *
+     * @param scriptResource classpath-relative path of the Ruby page script
+     * @throws IllegalStateException if the JRuby engine or a script resource is unavailable
+     */
+    ScriptProcessor(String scriptResource) {
+        rubyEngine = new ScriptEngineManager().getEngineByName("jruby");
+        if (rubyEngine == null) {
+            // getEngineByName returns null rather than throwing when no provider matches.
+            throw new IllegalStateException("No script engine named 'jruby' is available");
+        }
+        // The newline keeps the two sources separate even if defines.rb lacks a trailing one.
+        script = readResource(DEFINES_RESOURCE) + "\n" + readResource(scriptResource);
+    }
+
+    /**
+     * Binds the page to the engine-scope attribute {@code page} and evaluates the script.
+     *
+     * @param page the downloaded page handed to the script
+     */
+    @Override
+    public void process(Page page) {
+        // NOTE(review): the engine's default context is shared by every call, so concurrent
+        // invocations would overwrite each other's "page" binding - confirm how Spider calls this.
+        ScriptContext context = rubyEngine.getContext();
+        context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
+        try {
+            rubyEngine.eval(script, context);
+        } catch (ScriptException e) {
+            // A script failure on one page is reported but deliberately not rethrown,
+            // matching the original best-effort behaviour.
+            e.printStackTrace();
+        }
+    }
+
+    @Override
+    public Site getSite() {
+        return Site.me();
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new ScriptProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run();
+    }
+
+    /**
+     * Reads a classpath resource fully as text and closes the stream.
+     *
+     * @param path classpath-relative resource path
+     * @return the resource content
+     * @throws IllegalStateException if the resource is missing or cannot be read
+     */
+    private String readResource(String path) {
+        InputStream in = getClass().getClassLoader().getResourceAsStream(path);
+        if (in == null) {
+            // getResourceAsStream signals "not found" with null; fail with the path instead of an NPE.
+            throw new IllegalStateException("Script resource not found on classpath: " + path);
+        }
+        try {
+            return IOUtils.toString(in, SCRIPT_ENCODING);
+        } catch (IOException e) {
+            throw new IllegalStateException("Failed to read script resource: " + path, e);
+        } finally {
+            IOUtils.closeQuietly(in);
+        }
+    }
+}
diff --git a/webmagic-scripts/src/main/resources/ruby/defines.rb b/webmagic-scripts/src/main/resources/ruby/defines.rb
new file mode 100644
index 00000000..6d3cbd86
--- /dev/null
+++ b/webmagic-scripts/src/main/resources/ruby/defines.rb
@@ -0,0 +1,11 @@
+def xpath str
+    $page.getHtml().xpath(str).toString()
+end
+def css str
+    $page.getHtml().css(str).toString()
+end
+def urls str
+    links = $page.getHtml().links().regex(str).all();
+    $page.addTargetRequests(links);
+end
+
diff --git a/webmagic-scripts/src/main/resources/ruby/oschina.rb b/webmagic-scripts/src/main/resources/ruby/oschina.rb
new file mode 100644
index 00000000..225f8224
--- /dev/null
+++ b/webmagic-scripts/src/main/resources/ruby/oschina.rb
@@ -0,0 +1,5 @@
+title = css "div.BlogTitle h1"
+content = css "div.BlogContent"
+urls "http://my\\.oschina\\.net/flashsword/blog/\\d+"
+puts title
+puts content
\ No newline at end of file
diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/jruby/TestJRubyCall.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/jruby/TestJRubyCall.java
new file mode 100644
index 00000000..c2965171
--- /dev/null
+++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/jruby/TestJRubyCall.java
@@ -0,0 +1,25 @@
+package us.codecraft.webmagic.jruby;
+
+import org.junit.Test;
+
+import javax.script.ScriptContext;
+import javax.script.ScriptEngine;
+import javax.script.ScriptEngineManager;
+import javax.script.ScriptException;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class TestJRubyCall {
+
+    @Test
+    public void test() throws ScriptException {
+        ScriptEngineManager manager = new ScriptEngineManager();
+        ScriptEngine rubyEngine = manager.getEngineByName("jruby");
+        ScriptContext context = rubyEngine.getContext();
+
+        context.setAttribute("a", "sad", ScriptContext.ENGINE_SCOPE);
+//        rubyEngine.eval("", context);
+        rubyEngine.eval("b=1; puts b", context);
+    }
+}