From df8ca8ad092307f95cf673c8db606edb57a12247 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 10 Nov 2013 22:30:48 +0800 Subject: [PATCH] add scripts --- asserts/logo-simple.jpg | Bin 0 -> 8657 bytes pom.xml | 1 + .../webmagic/selector/PlainText.java | 10 +++ .../webmagic/selector/Selectable.java | 17 +++++ webmagic-scripts/pom.xml | 35 ++++++++++ .../webmagic/processor/ScriptProcessor.java | 61 ++++++++++++++++++ .../src/main/resources/ruby/defines.rb | 11 ++++ .../src/main/resources/ruby/oschina.rb | 5 ++ .../webmagic/jruby/TestJRubyCall.java | 25 +++++++ 9 files changed, 165 insertions(+) create mode 100644 asserts/logo-simple.jpg create mode 100644 webmagic-scripts/pom.xml create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java create mode 100644 webmagic-scripts/src/main/resources/ruby/defines.rb create mode 100644 webmagic-scripts/src/main/resources/ruby/oschina.rb create mode 100644 webmagic-scripts/src/test/java/us/codecraft/webmagic/jruby/TestJRubyCall.java diff --git a/asserts/logo-simple.jpg b/asserts/logo-simple.jpg new file mode 100644 index 0000000000000000000000000000000000000000..366aa6276185d8b1c946aae4c3e453fdc377e1b9 GIT binary patch literal 8657 zcmbW62Ut@}x3G5#y;tc)dKc-vNpA{B6O9mhmkt6^RFEPlRf?d1D4;|@iYOpmk=}dn zQW9!_KyEna|Ia!1d!Bp$@BdbwnY@`vj^mPCb2mnC%4}e=B zx7OFvf?YQ=)zLT7{-c9j&&bW&2TTnBUfuzIX1bbO);6|WL_Gi)AOQ#gNdT~i`}-Q3 z>sSDI(bv}E^2aM(@ZWMca`Eo~Fes*Xor~-5_J50L;J$tVcmuEFwWS=L{NZ?hjAzf_ z0N;yt6rSlFJT4dvyDMSzzOb(=PP(7_p>n5!m}~}5R<$7 zgB||C0Zt)!I{`q;$2Zi^&BZlmz?qToe$ff1uY zHlKU|kUzk$&GWz895Ml*<_Q2WP5j-)`x*eK9s@vi??3!uyx1?U0Rg^>5)#3|!QyUC zaPf-;{Y(Cz3V&(-&+w1=#4qOiYwftMIXT-0dIoS^EGpc`(bMtn=huhoHEx^sko6F7ne~R#bIP4!bT;QMk8V_`Z z=KwuS9H8!{2OwKr0F;glfY{~Ydq981&4|PjxHx%MoU4ECdpzU&|ET|41x~{M1pB+W za9uQOnOShb1O0+77=I=%DguBMpaSRtW`G0W0R#at;3^;oC;{q#Hedjl0v3P`a0h?` zu7DTd2LuD*z(XJ!hzFhnX+Rc`2NVLuKp9X0)BxXrCZHYo1@r;Kz$7pSECas*6mSTf zf(-~$K%A%ieNxFKQ?d59Lo6k-c;hWJ4uA#sp&NIs+tQU~dP3_|80 zn~-BDA(Rfv1r>uTLG_?Fp>XIuXe2ZNngx9it%kNj2ch#&6!e^cf`E-cgg}u%pTL^H zg&>$9njoE^h@gs~jbNBync$F+kdTp3fKZN5kI;tDo$x+k0^ut{B;j|$e!@k<10o_K zW+D+H6(Un2d!hiMXrfG_4@8Yb14PS27-BMFPU5S?I>ffb-o#PF>BJ?(4a5V)tHh@y z)FgZ)iX^5ajwGQZNhF0NbtJtc%Ot0yG^7HgDx?;q?xYV%Uy_!QwvkSe?vas^T_#f? zyFung_JHgq*(b6eWbateQUp*WQWR6P zP|Q$ZDCsE0DfKBGDG`*}l+~02l-pD!RD4t#RClPtsM4t_sd}k4sY$5$skNx>sUxYg zslQNn;eWB>KyJIsT>U)>zs6) zs+?||sho|R8(a)r8eE=SFS%N|P~2?Xdfb8BZ@7E7PcI8xzIi$7GV<~?4=Im4k26my zPcsjSmy_3+H=MVWcY=?EPoB?}FN3dx?}%T3--5y()Yael`2*kQ9ZK zjub+wRthaGBJCudD?KJdCu1xVBhxH%CMzfFCtD`FF2^f(S1w0xRGwb`x_pBC4+R1R zO$CI)H-%$GS;YXw3dJ2IF(nVB5~Vd|0c9uULghu3%PRINZ&c<~xm53}zE+)6<5sg* z%U4@a=TUc5FH&FC5Ylkh_@J?^c}3Gtvr6+wOHnIA>-#mxHQj4**Lt*RwQp)?Y0v0f z)^X7()j{dX=-$`;u1BC}sP{r|RG(cRu3xN=GLSQfFlaR-H#9fQGF&hcHu5v7GX@(Q z7^fIdnedr-o79*BruwESrqkC2uHU=<^#o>A*te8ofg`0Jl)0xA}OUw@~G%b=W zCT|Mf47}NFNo8qgS!{V^bP#w4U3JNO|31Ft)*>|?ZGYWTWPnJ?PTnt z?M7}3+zz?@^A5`$&pY)nDwsX2@-D$$tGgw4vG%6+`Su46`VKh`D7Y3p1OD4l-7(d1 z-ATnM*=f~T#W~q|%|*o}#bw=9%{AS1(@o1Q%MI z6Kok=8A21{9nuvl92y(C8m1HW_CEAJ{C;ydclg8b`3Q}OH<6%7hsb6G58^Rm`GM|( z_YX-QdOYlox)PNfwg1TKQQc$q#}6JaKGA(r8ciAP7d;xI6qEN9`qb@dPpnjIX6$*K zV_awamH6~{Yyv!?^O@wcm(OsC&WSxqvPrqm37>mEA9V6t5?Pt1ldR_);MqtL=OQV;ynY@|LGqG8&S;N_y*~lEWoY;Iupp>lrO>jlmMbnUVJk^0 z1(ybvu79xm(D(7$$J#QnvNuRZWc(-KQ{bnK^1J0D6^0efmGYHkpZPxLRMAz%RfDQS zt5G$sHFLE#wf$cVzBJb<)m42J|N8D5_qXhNhWexi(uT(kxW=%?gYUlIH=A6V7Mo$s zlP%UQL#;Pjd)f@!ezfbfw{&Q9H2zTe@$IML&o7JeTHo&e^zJ~IVU?;KYwk$Yr%YBauL4xd&z$ZyZm^CVkLc*bG3Nw>e{z;o%Ozr zTN}&2eSe>BMsLw<HX$|>jyK3UWcbgaTsRI z`(ydzjuWes)ze@s5%%So&{^HN;rSHK3x~r^0$Kq0LO_4iU@!zi2!-H3MHf@Att%d`2Q2!cYuZ% zcm+IxfGz`I8W4mAglh#j@i`3vK7GK;MXquoATR_qqfQIrv49kTPCkVrJpv7Z4N@ zmX?u~lUGpGx~8q8tEX>ZZgJDn3ZH1f9i5zAT;1IL0|JABLqfwIJ$@1$^E5UtIVCkM zJ>z9&R(?U@+oE^xi%TjhKUY=P)PAXJZfR|6@A&bvb6{|2cw}^Jd}4lKacOyFb!~kE zy}P%6aCn3{KDqD<1VH|D>o3p##V;DXUtlN{0wuce3j_|naGVB8z%5Bgt8s(K{vO@s zs}G6kHDA0bZzAE5GD9&q`1O-A@=DM1p)Xwf!?Sea1@e6GTk!mCtA*;yxp=Yb`U}s*xy82PH8;$zQg4%6c~H#=Xoii#V62$S zhdrz3q^oES57w_9lWqajQPzBvPtrsmmFei*OiN8{rI~2>fm9M}H>4x^a%(y@{Z&d| z^AjuS$+Zbm!TfqU-+#>dPYVEKLW3+8K2o*A_O5C6@s8L`W2oGCpwg-3iWK&Gd&~Bm z;}0Cr3}XHvzm9O-N!SaKyO#9*oaWUMx~$}(<9$SyAyeK1E4E`~zwUKaZ_25y$lg4H ze2mu|a;rOtKBN64M}1n#{#mLERQ=uQG}()xIX`;3y%6k8J7?}W)b`JCzs;`_a+ru! z3isqvH7BaVh;$x?W=q^8roA$Y{FkH_ykXb{-IAHFRKNxh0gMEnu z2dpjPfY^7Ko%x#B2Ktcg)rGvK_puf5##W9?>PAxSlr{w#SQV50PCu)J_j|h?x1c4R zSu1auf65H=4F~VRq~RK!8B&Sm#_MAzyK!@oY~ea69MC!&b0y@FW@-Usd1hR%rTcRR zTLI$AG%;3vr*unF>j!$cE`$h?XNv5~7q>ze{6PslYk z2*RwA0>{D$PL(k!w&iiG+5-U@j5L>lr}IIYuLHa+pDSN zmnq5m+aAmQp5YY+)GhM|IaPj$yzZmTP)uAe83D2Pk;}J@JkF|Lc@u9`h1bIv)8NIr zPKSCQyd6UqCd*SqE#3%FOfqb9(419@bJ|ADc5pCO#&zm|4X4?sm~@L{e|dA%-i-jQ z!$|sJ+%XGm5hP!9ine#Ooo80F$(>xk7DdKk28hq;!i@$Z6|pLl5pxv5zhHFMHi+VY z)Man#0A>N6BF@A_?TCn?4OzvGn9Zu&vMWm6s=EZa6Cpj8*Y1*jI(Y;Wi2Rju3p04A zG>TTTMgwt zxS>rC{8_lU!1@4XVIR2v`YpT3J2txhZuba=za|uZ`oYec(LjVp3! zS4uZ*mMcF_WXV+ijQT9TaVZ@$c-~x&SSuNP#d35`M&>*b`u!7XT`{+3?)#iK)+uKH z1LxSu$)F*I6AdwMd?n^7CXs$WHgCHhXag68 zAqD9s>2Xdp5+JmkPAtR9{E;_VDn|PSDXq$P&$yBN)cnG%h-<_BsGZHm1>;-IjSSe^ zC|Ijm@yb;ipLZ;Ga`@zvBmY~hUyqxM*>jr7iOx92}7 znz(&@r#L4>uJf(&&LvEMgjFws9n0>57Ma5igt(s%thV1_&qPZ1R@jLc*Qu^$VDM(LOh7!8j0%I6!IRRzYwjn5un`QCr->sVk?HY8W9f6XQB&-d|%3)v{# z@ye!D!V(Lnq2U=S<~s>C6g}8}&f51)zfu#m)l`#V`=z2cPtYOTQYLblS7Ewe<6P9L z6bH1>Z*cBk-^2kZqgiZFqjc-?mm(*=TjMjo(vNIw*Jkh?RHOlt46||)8B^hM+7Ul- z0J-^lM88%g4rr1>peH;c+Z3NaiwoB*cj@S4)w@e_r0Uk@k0=>`hdsf<&m9pi3u+m= z$`aojc!#&o31Q)Us4hQ!WXRW68eJZ7b;CgI9|E7mqD&oidu`85P)X6BaR32UvJWfy z&fatXx5Y!Y1ZGxD4i5O-WytC0vF9^S)MQE>lF`v1T2YQ}n|*+V&1W}O+I{>+wc_Qq zX2Yvby(l<`PGDHtMiyZuPsPS;Fe@#ss&eS|`o-*D3L_!*nMOA<%!p@fre8X`00~En zaVLq)iLyELXGt<=LsQB60%ckCI}wyv(vq}epTUS12VV46-JG8#+D}bm8pvx=D-T22 z8sr<}-H9g!OrFI^$ttRJ#WM`vF)u$C{l@?jUPdLDrnl$Sqtacan;TOf0NMo4S*C10zrzfkvE4e!`b^OsD zRn&|Vd`;=!c83;q^emk7-fK*@QO$6`PZJ|Q$E#P?)>_i+Q=*8K^K6eDOrv0ok&GKD zGgt<60c>qo@Af{sO{&Y7H^(i`!^WVQm4?nTpEiywsE#PCYM?3o?;sYjJSE!&(=1J; z%jA`LPSDZJmrfY>+4}d#c98=?9;@eJOqK;EclJMfly&q_V;wN)H(W_=2(ZtA&83)( zYiU1D9z@xn47Z-WvmE#T7|xGLDWW+#9NWR4&UOAfI}4=pobw^cx3|1+Fos=3$5Vz_>{}rba-+PX^jKo>u~@@5Ml{`Cm~JR zRNL?E*;7WI9yfO5gXM_>+#B4^(v<^U=Eclg5fs?FN%POlHn4ZbI^J{FO!HhWXw_Qu zu%~{n63TuruS_{&0Y($z-!x!2k09D;5F05nOMfxR)GDbz{+6$WU}yJH4xC4!44d^< zVL3HV$}W5Ie$l>1#Qi{-=1%d)fs^G!9-I?-sHYo^V|mW!6Q^vAcTaUskZSh3M>wGJ zEAwF)>%1;j6p6VBLkC8-Dw`hul#R&i?0xZMys`6G&IK`j&i-XprrqPtWz1G#I6cbp zqu1QorCVsV-0p>EvNIZA9HxSvHiUbNJZtr734l7+o0_7%3TLU*=6OdyAipuY#@2-# z46;!WLz2fFfF505y2Mu^BTqss$_l3noGR-qgMuT(t7nZ((qx2YZ#wO&8dacco-}ac zZ)U0@FieV7{Iz~>>8IxY*S2|cI-D^RaVCh7+}x@)+tDo#RJM^{R(Jy!n$_#rw!>7i zD1zi=JA#@beDHO+ImSkA>jwH#d!#CAGnacH(5=}O-Cfug^z>=aaYDt5s_M%D&|Tfp z4O1}I^z4;HRV@$dJ1Qf(%H{55{Q$?oS%KHIo}{Lpgx%row_cirqu$=3F-y&As2FaP zP9$Z>u1gsB8u>NX!yg^9Skp!(QI3_s1P0E(QVifZbw+K=_lhQW(sFlPht$034#*>K zbS4ucYx7vxd3sWUX=yzVo`AAuZYAFyzpA9xEwxG{3iSA~MyY;=(meO0VAwJLF)tif z#_wI2KH4v#_fbu%O#a$Lnr;^kpsK~Eokx_P{!iWs=~Wd9*Z71_H+{^|+6#Pb%?_+& zW@D$?EiMib@}h`wa~LmH8Nt6Hw?DQyV6DBEntAi5?@SXn&t&KUvg`rrYy5i`fO2@xzqv!Q z-E`8!BwJrR(ekTLeOaDB_wf$Q>J0=vqMf8vkZNWXW}I-TCFXgPoU&qKNC#N%ion z46UALf(yc5CZ}z~@`MpBN?1d0-m^mGko76AFF0VSZr3!v5eV0s=Oi2t>4DMyhS7c4 zUvJIo&V9=p*G!&J_k#Owy%x0}i0HM<38zuA{7OwSbAi4?pzzx7w7Vf(kA|$?#Lj)1EOpPl{ z`F~Z|u|u|Uj@QihCUR%344T47ekV{)P7BLibH)Kh&r#2|bw<=?ka^)Q1;6tuIk)J1 z7GWgXGQBPJ?p_&TL5s`p__C&QTmAYam+n4#&Kf+`z=RfmwnznDpW*2fA2~9tf2wRE z&r@#6{*o<9iNBk1eA+43H>0>A3pR6kuSlgZTQEN`Fwl6mA`Fd3>lShqVJK)~ELX?-qu zdO5DEU&dNymT+Bh`h$Oq*L$0`Kzje8pA3%b->+<{E5{Auw`yv#>z2vIGawxQVE%v! zo-id^Bz?6c%}Qhs2unEeRTGY&zYc;4NA^cb4I{Td`$V0S!R6w@jXG=ESUI$d1(qFD z*E}WR6EdC&SCA#Y_m&Ub8dwlb7FFyn9Xr_+w))zM`HM@@B+)HPL8m$(0# z5@-xi*oYu}PTk3@=fuPHXe7P9MB3gU>oG&}*hYednc-dhUEVA)xUGS4K-mqoXMyVZ z0_2%AJZ{H^_ve;u+`pQ0MK!;H^(-myqx0m#PW(4+_VHTO!kEgu){$jEy8Q6ypkH}! z+3-H{3)ZTXf@nVG=|Q_!f$6}uI6kklTXa58LuYK-6wr7rHd6oiCUt9E69=g8aGk-C z8C?sxK983N?i|dfRbzWdH!xX2v*z2Kg82=vcP`fUzZEnrQH|tdq;R;A zn8K;LQ2&il|Jhoe9LC+}=R&l3?b)ZA;i71lw%ck=C#ErF`@d9FM|N!jg(Uc##?!QQ zZUWm}p*i=4!je@+TTaB@#VC^%xH1A~NyZg%BW#HV?GH6nSD950<_24C=K3fOJ}Y2) ikCZ(K!Y1ksrnG*M$+y(0$6JUV`M(cM|IRl8H~wGaq@|Yt literal 0 HcmV?d00001 diff --git a/pom.xml b/pom.xml index 2aa3df7a..eba68c85 100644 --- a/pom.xml +++ b/pom.xml @@ -48,6 +48,7 @@ webmagic-core webmagic-extension/ + webmagic-scripts diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 9406f3ab..bb1b8688 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -45,6 +45,16 @@ public class PlainText implements Selectable { throw new UnsupportedOperationException(); } + @Override + public Selectable css(String selector) { + return $(selector); + } + + @Override + public Selectable css(String selector, String attrName) { + return $(selector, attrName); + } + @Override public Selectable smartContent() { throw new UnsupportedOperationException(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 66df5d5b..6b4410e1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -35,6 +35,23 @@ public interface Selectable { */ public Selectable $(String selector, String attrName); + /** + * select list with css selector + * + * @param selector css selector expression + * @return new Selectable after extract + */ + public Selectable css(String selector); + + /** + * select list with css selector + * + * @param selector css selector expression + * @param attrName attribute name of css selector + * @return new Selectable after extract + */ + public Selectable css(String selector, String attrName); + /** * select smart content with ReadAbility algorithm * diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml new file mode 100644 index 00000000..99aac59b --- /dev/null +++ b/webmagic-scripts/pom.xml @@ -0,0 +1,35 @@ + + + + webmagic-parent + us.codecraft + 0.4.1-SNAPSHOT + + 4.0.0 + + us.codecraft + webmagic-scripts + 0.4.1-SNAPSHOT + + + + org.jruby + jruby + 1.7.6 + + + junit + junit + test + + + us.codecraft + webmagic-core + ${project.version} + + + + + \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java new file mode 100644 index 00000000..b821ae48 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.processor; + +import org.apache.commons.io.IOUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; + +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; +import javax.script.ScriptException; +import java.io.IOException; +import java.io.InputStream; + +/** + * @author code4crafter@gmail.com + */ +public class ScriptProcessor implements PageProcessor{ + + private ScriptEngine rubyEngine; + + private String defines; + + ScriptProcessor(){ + ScriptEngineManager manager = new ScriptEngineManager(); + rubyEngine = manager.getEngineByName("jruby"); + InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/defines.rb"); + try { + defines = IOUtils.toString(resourceAsStream); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Override + public void process(Page page) { + ScriptContext context = rubyEngine.getContext(); + context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); + String script; + try { + InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/oschina.rb"); + try { + script = IOUtils.toString(resourceAsStream); + rubyEngine.eval(defines+script, context); + } catch (IOException e) { + e.printStackTrace(); + } + } catch (ScriptException e) { + e.printStackTrace(); + } + } + + @Override + public Site getSite() { + return Site.me(); + } + + public static void main(String[] args) { + Spider.create(new ScriptProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run(); + } +} diff --git a/webmagic-scripts/src/main/resources/ruby/defines.rb b/webmagic-scripts/src/main/resources/ruby/defines.rb new file mode 100644 index 00000000..6d3cbd86 --- /dev/null +++ b/webmagic-scripts/src/main/resources/ruby/defines.rb @@ -0,0 +1,11 @@ +def xpath str + $page.getHtml().xpath(str).toString() +end +def css str + $page.getHtml().css(str).toString() +end +def urls str + links = $page.getHtml().links().regex(str).all(); + $page.addTargetRequests(links); +end + diff --git a/webmagic-scripts/src/main/resources/ruby/oschina.rb b/webmagic-scripts/src/main/resources/ruby/oschina.rb new file mode 100644 index 00000000..225f8224 --- /dev/null +++ b/webmagic-scripts/src/main/resources/ruby/oschina.rb @@ -0,0 +1,5 @@ +title = css "div.BlogTitle h1" +content = css "div.BlogContent" +urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" +puts title +puts content \ No newline at end of file diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/jruby/TestJRubyCall.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/jruby/TestJRubyCall.java new file mode 100644 index 00000000..c2965171 --- /dev/null +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/jruby/TestJRubyCall.java @@ -0,0 +1,25 @@ +package us.codecraft.webmagic.jruby; + +import org.junit.Test; + +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; +import javax.script.ScriptException; + +/** + * @author code4crafter@gmail.com + */ +public class TestJRubyCall { + + @Test + public void test() throws ScriptException { + ScriptEngineManager manager = new ScriptEngineManager(); + ScriptEngine rubyEngine = manager.getEngineByName("jruby"); + ScriptContext context = rubyEngine.getContext(); + + context.setAttribute("a", "sad", ScriptContext.ENGINE_SCOPE); +// rubyEngine.eval("", context); + rubyEngine.eval("b=1; puts b", context); + } +}