Merge branch 'master' of github.com:code4craft/webmagic

Conflicts:
	README.md
	pom.xml
	webmagic-samples/pom.xml
	webmagic-selenium/pom.xml
pull/358/head
yihua.huang 11 years ago
commit fd23cb6dc0

@ -1,6 +1,6 @@
![http://static.oschina.net/uploads/space/2013/1110/200709_oP1e_190591.jpg](http://static.oschina.net/uploads/space/2013/1110/200709_oP1e_190591.jpg)
---------
[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic)
![logo](https://raw.github.com/code4craft/webmagic/master/asserts/logo.jpg)
[Readme in English](https://github.com/code4craft/webmagic/tree/master/en_docs)
@ -38,12 +38,12 @@ webmagic使用maven管理依赖在项目中添加对应的依赖即可使用w
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.4.0</version>
<version>0.4.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.4.0</version>
<version>0.4.1</version>
</dependency>
#### 项目结构
@ -152,3 +152,6 @@ webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
### Mail-list:
[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java)

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.5 KiB

@ -0,0 +1,351 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>ActiveLayerIndex</key>
<integer>0</integer>
<key>ApplicationVersion</key>
<array>
<string>com.omnigroup.OmniGrafflePro</string>
<string>139.16.0.171715</string>
</array>
<key>AutoAdjust</key>
<false/>
<key>BackgroundGraphic</key>
<dict>
<key>Bounds</key>
<string>{{0, 0}, {48, 48}}</string>
<key>Class</key>
<string>SolidGraphic</string>
<key>ID</key>
<integer>2</integer>
<key>Style</key>
<dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<key>BaseZoom</key>
<integer>0</integer>
<key>CanvasOrigin</key>
<string>{0, 0}</string>
<key>CanvasSize</key>
<string>{48, 48}</string>
<key>ColumnAlign</key>
<integer>1</integer>
<key>ColumnSpacing</key>
<real>36</real>
<key>CreationDate</key>
<string>2013-11-10 06:17:01 +0000</string>
<key>Creator</key>
<string>黄 亿华</string>
<key>DisplayScale</key>
<string>1 pt = 1 pt</string>
<key>GraphDocumentVersion</key>
<integer>8</integer>
<key>GraphicsList</key>
<array>
<dict>
<key>Bounds</key>
<string>{{7.5, 24}, {23, 15}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>YES</string>
<key>Flow</key>
<string>Resize</string>
<key>ID</key>
<integer>45</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Pad</key>
<integer>0</integer>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 Cochin;}
{\colortbl;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\fs16 \cf0 Magi
\fs24 c}</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
<key>Wrap</key>
<string>NO</string>
</dict>
<dict>
<key>Bounds</key>
<string>{{18, 13}, {19.359630584716797, 18}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>Vertical</string>
<key>Flow</key>
<string>Resize</string>
<key>FontInfo</key>
<dict>
<key>Color</key>
<dict>
<key>w</key>
<string>0</string>
</dict>
<key>Font</key>
<string>STHeitiSC-Light</string>
<key>Size</key>
<real>6</real>
</dict>
<key>ID</key>
<integer>39</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fmodern\fcharset0 Courier-Oblique;}
{\colortbl;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\i\fs14 \cf0 eb}</string>
</dict>
</dict>
<dict>
<key>Class</key>
<string>LineGraphic</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica</string>
<key>Size</key>
<real>13</real>
</dict>
<key>ID</key>
<integer>31</integer>
<key>Points</key>
<array>
<string>{6, 11}</string>
<string>{15, 27}</string>
<string>{14, 8}</string>
<string>{21, 26}</string>
<string>{22, 6}</string>
<string>{22, 6}</string>
</array>
<key>Style</key>
<dict>
<key>stroke</key>
<dict>
<key>HeadArrow</key>
<string>0</string>
<key>Legacy</key>
<true/>
<key>LineType</key>
<integer>1</integer>
<key>TailArrow</key>
<string>0</string>
</dict>
</dict>
</dict>
</array>
<key>GridInfo</key>
<dict>
<key>GridSpacing</key>
<real>1</real>
<key>ShowsGrid</key>
<string>YES</string>
<key>SnapsToGrid</key>
<string>YES</string>
</dict>
<key>GuidesLocked</key>
<string>NO</string>
<key>GuidesVisible</key>
<string>YES</string>
<key>HPages</key>
<integer>1</integer>
<key>ImageCounter</key>
<integer>2</integer>
<key>KeepToScale</key>
<false/>
<key>Layers</key>
<array>
<dict>
<key>Lock</key>
<string>NO</string>
<key>Name</key>
<string>图层 1</string>
<key>Print</key>
<string>YES</string>
<key>View</key>
<string>YES</string>
</dict>
</array>
<key>LayoutInfo</key>
<dict>
<key>Animate</key>
<string>NO</string>
<key>circoMinDist</key>
<real>18</real>
<key>circoSeparation</key>
<real>0.0</real>
<key>layoutEngine</key>
<string>dot</string>
<key>neatoSeparation</key>
<real>0.0</real>
<key>twopiSeparation</key>
<real>0.0</real>
</dict>
<key>LinksVisible</key>
<string>NO</string>
<key>MagnetsVisible</key>
<string>NO</string>
<key>MasterSheets</key>
<array/>
<key>ModificationDate</key>
<string>2013-11-10 06:51:47 +0000</string>
<key>Modifier</key>
<string>黄 亿华</string>
<key>NotesVisible</key>
<string>NO</string>
<key>Orientation</key>
<integer>2</integer>
<key>OriginVisible</key>
<string>NO</string>
<key>PageBreaks</key>
<string>YES</string>
<key>PrintInfo</key>
<dict>
<key>NSBottomMargin</key>
<array>
<string>float</string>
<string>41</string>
</array>
<key>NSHorizonalPagination</key>
<array>
<string>coded</string>
<string>BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG</string>
</array>
<key>NSLeftMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
<key>NSPaperSize</key>
<array>
<string>size</string>
<string>{594.99997329711914, 842}</string>
</array>
<key>NSPrintReverseOrientation</key>
<array>
<string>int</string>
<string>0</string>
</array>
<key>NSRightMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
<key>NSTopMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
</dict>
<key>PrintOnePage</key>
<false/>
<key>ReadOnly</key>
<string>NO</string>
<key>RowAlign</key>
<integer>1</integer>
<key>RowSpacing</key>
<real>36</real>
<key>SheetTitle</key>
<string>版面 1</string>
<key>SmartAlignmentGuidesActive</key>
<string>NO</string>
<key>SmartDistanceGuidesActive</key>
<string>NO</string>
<key>UniqueID</key>
<integer>1</integer>
<key>UseEntirePage</key>
<false/>
<key>VPages</key>
<integer>1</integer>
<key>WindowInfo</key>
<dict>
<key>CurrentSheet</key>
<integer>0</integer>
<key>ExpandedCanvases</key>
<array/>
<key>Frame</key>
<string>{{491, 381}, {498, 477}}</string>
<key>ListView</key>
<true/>
<key>OutlineWidth</key>
<integer>142</integer>
<key>RightSidebar</key>
<false/>
<key>Sidebar</key>
<true/>
<key>SidebarWidth</key>
<integer>116</integer>
<key>VisibleRegion</key>
<string>{{0.125, 0.125}, {47.75, 47.875}}</string>
<key>Zoom</key>
<real>8</real>
<key>ZoomValues</key>
<array>
<array>
<string>版面 1</string>
<real>8</real>
<real>1</real>
</array>
</array>
</dict>
</dict>
</plist>

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

@ -0,0 +1,552 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>ActiveLayerIndex</key>
<integer>0</integer>
<key>ApplicationVersion</key>
<array>
<string>com.omnigroup.OmniGrafflePro</string>
<string>139.16.0.171715</string>
</array>
<key>AutoAdjust</key>
<true/>
<key>BackgroundGraphic</key>
<dict>
<key>Bounds</key>
<string>{{0, 0}, {1117.9999465942383, 783}}</string>
<key>Class</key>
<string>SolidGraphic</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica</string>
<key>Size</key>
<real>37</real>
</dict>
<key>ID</key>
<integer>2</integer>
<key>Style</key>
<dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<key>BaseZoom</key>
<integer>0</integer>
<key>CanvasOrigin</key>
<string>{0, 0}</string>
<key>ColumnAlign</key>
<integer>1</integer>
<key>ColumnSpacing</key>
<real>36</real>
<key>CreationDate</key>
<string>2013-11-10 06:51:58 +0000</string>
<key>Creator</key>
<string>黄 亿华</string>
<key>DisplayScale</key>
<string>1 0/72 in = 1 0/72 in</string>
<key>GraphDocumentVersion</key>
<integer>8</integer>
<key>GraphicsList</key>
<array>
<dict>
<key>Class</key>
<string>LineGraphic</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica</string>
<key>Size</key>
<real>13</real>
</dict>
<key>Head</key>
<dict>
<key>ID</key>
<integer>60</integer>
<key>Position</key>
<real>0.40939974784851074</real>
</dict>
<key>ID</key>
<integer>62</integer>
<key>Points</key>
<array>
<string>{324, 109}</string>
<string>{339.36559006029825, 179.11528294284673}</string>
</array>
<key>Style</key>
<dict>
<key>stroke</key>
<dict>
<key>HeadArrow</key>
<string>0</string>
<key>Legacy</key>
<true/>
<key>LineType</key>
<integer>1</integer>
<key>TailArrow</key>
<string>0</string>
<key>Width</key>
<real>10</real>
</dict>
</dict>
<key>Tail</key>
<dict>
<key>ID</key>
<integer>59</integer>
<key>Info</key>
<integer>4</integer>
</dict>
</dict>
<dict>
<key>Class</key>
<string>LineGraphic</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica</string>
<key>Size</key>
<real>13</real>
</dict>
<key>Head</key>
<dict>
<key>ID</key>
<integer>60</integer>
<key>Position</key>
<real>0.73653632402420044</real>
</dict>
<key>ID</key>
<integer>61</integer>
<key>Points</key>
<array>
<string>{269, 146}</string>
<string>{296, 194}</string>
<string>{309, 266}</string>
<string>{349, 265}</string>
<string>{348.96211936963607, 215.03741157007715}</string>
</array>
<key>Style</key>
<dict>
<key>stroke</key>
<dict>
<key>HeadArrow</key>
<string>0</string>
<key>Legacy</key>
<true/>
<key>LineType</key>
<integer>1</integer>
<key>TailArrow</key>
<string>0</string>
<key>Width</key>
<real>10</real>
</dict>
</dict>
<key>Tail</key>
<dict>
<key>ID</key>
<integer>59</integer>
</dict>
</dict>
<dict>
<key>Class</key>
<string>LineGraphic</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica</string>
<key>Size</key>
<real>13</real>
</dict>
<key>ID</key>
<integer>60</integer>
<key>Points</key>
<array>
<string>{371.89694213867188, 179}</string>
<string>{356.89694213867188, 162}</string>
<string>{335.89694213867188, 188}</string>
<string>{351.89694213867188, 217}</string>
<string>{371.89694213867188, 202}</string>
</array>
<key>Style</key>
<dict>
<key>stroke</key>
<dict>
<key>HeadArrow</key>
<string>0</string>
<key>Legacy</key>
<true/>
<key>LineType</key>
<integer>1</integer>
<key>TailArrow</key>
<string>0</string>
<key>Width</key>
<real>10</real>
</dict>
</dict>
</dict>
<dict>
<key>Class</key>
<string>LineGraphic</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica</string>
<key>Size</key>
<real>13</real>
</dict>
<key>ID</key>
<integer>59</integer>
<key>Points</key>
<array>
<string>{269, 146}</string>
<string>{295, 189}</string>
<string>{300, 110}</string>
<string>{310, 178}</string>
<string>{324, 109}</string>
</array>
<key>Style</key>
<dict>
<key>stroke</key>
<dict>
<key>HeadArrow</key>
<string>0</string>
<key>Legacy</key>
<true/>
<key>LineType</key>
<integer>1</integer>
<key>TailArrow</key>
<string>0</string>
<key>Width</key>
<real>10</real>
</dict>
</dict>
</dict>
<dict>
<key>Bounds</key>
<string>{{335.89695436197019, 119}, {41, 43}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>YES</string>
<key>Flow</key>
<string>Resize</string>
<key>ID</key>
<integer>47</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Pad</key>
<integer>0</integer>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\fs72 \cf0 eb}</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
<key>Wrap</key>
<string>NO</string>
</dict>
<dict>
<key>Bounds</key>
<string>{{164, 154}, {236.89692325714185, 98.181818181818088}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>ID</key>
<integer>45</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Pad</key>
<integer>0</integer>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 Cochin;}
{\colortbl;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\fs96 \cf0 Magi}</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
<key>Wrap</key>
<string>NO</string>
</dict>
<dict>
<key>Class</key>
<string>LineGraphic</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica</string>
<key>Size</key>
<real>13</real>
</dict>
<key>ID</key>
<integer>31</integer>
<key>Points</key>
<array>
<string>{50.404270172119141, 72.000000000000256}</string>
<string>{115.40427017211914, 154.00000000000028}</string>
<string>{103.80320000069037, 26.090909090909292}</string>
<string>{124.95447158813477, 97}</string>
<string>{175.90226360069005, 143.90909090909116}</string>
<string>{186.20212982926148, 13}</string>
<string>{186.20212982926148, 13}</string>
</array>
<key>Style</key>
<dict>
<key>stroke</key>
<dict>
<key>HeadArrow</key>
<string>0</string>
<key>Legacy</key>
<true/>
<key>LineType</key>
<integer>1</integer>
<key>TailArrow</key>
<string>0</string>
<key>Width</key>
<real>10</real>
</dict>
</dict>
</dict>
<dict>
<key>Bounds</key>
<string>{{406.79786682128906, 136.09091186523438}, {165, 160}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>ID</key>
<integer>46</integer>
<key>ImageID</key>
<integer>1</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
</array>
<key>GridInfo</key>
<dict/>
<key>GuidesLocked</key>
<string>NO</string>
<key>GuidesVisible</key>
<string>YES</string>
<key>HPages</key>
<integer>2</integer>
<key>ImageCounter</key>
<integer>2</integer>
<key>ImageLinkBack</key>
<array>
<dict/>
</array>
<key>ImageList</key>
<array>
<string>image1.tiff</string>
</array>
<key>KeepToScale</key>
<false/>
<key>Layers</key>
<array>
<dict>
<key>Lock</key>
<string>NO</string>
<key>Name</key>
<string>图层 1</string>
<key>Print</key>
<string>YES</string>
<key>View</key>
<string>YES</string>
</dict>
</array>
<key>LayoutInfo</key>
<dict>
<key>Animate</key>
<string>NO</string>
<key>circoMinDist</key>
<real>18</real>
<key>circoSeparation</key>
<real>0.0</real>
<key>layoutEngine</key>
<string>dot</string>
<key>neatoSeparation</key>
<real>0.0</real>
<key>twopiSeparation</key>
<real>0.0</real>
</dict>
<key>LinksVisible</key>
<string>NO</string>
<key>MagnetsVisible</key>
<string>NO</string>
<key>MasterSheets</key>
<array/>
<key>ModificationDate</key>
<string>2013-11-10 07:00:00 +0000</string>
<key>Modifier</key>
<string>黄 亿华</string>
<key>NotesVisible</key>
<string>NO</string>
<key>Orientation</key>
<integer>2</integer>
<key>OriginVisible</key>
<string>NO</string>
<key>PageBreaks</key>
<string>YES</string>
<key>PrintInfo</key>
<dict>
<key>NSBottomMargin</key>
<array>
<string>float</string>
<string>41</string>
</array>
<key>NSHorizonalPagination</key>
<array>
<string>coded</string>
<string>BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG</string>
</array>
<key>NSLeftMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
<key>NSPaperSize</key>
<array>
<string>size</string>
<string>{594.99997329711914, 842}</string>
</array>
<key>NSPrintReverseOrientation</key>
<array>
<string>int</string>
<string>0</string>
</array>
<key>NSRightMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
<key>NSTopMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
</dict>
<key>PrintOnePage</key>
<false/>
<key>ReadOnly</key>
<string>NO</string>
<key>RowAlign</key>
<integer>1</integer>
<key>RowSpacing</key>
<real>36</real>
<key>SheetTitle</key>
<string>版面 1</string>
<key>SmartAlignmentGuidesActive</key>
<string>YES</string>
<key>SmartDistanceGuidesActive</key>
<string>YES</string>
<key>UniqueID</key>
<integer>1</integer>
<key>UseEntirePage</key>
<false/>
<key>VPages</key>
<integer>1</integer>
<key>WindowInfo</key>
<dict>
<key>CurrentSheet</key>
<integer>0</integer>
<key>ExpandedCanvases</key>
<array/>
<key>Frame</key>
<string>{{350, -208}, {693, 795}}</string>
<key>ListView</key>
<true/>
<key>OutlineWidth</key>
<integer>142</integer>
<key>RightSidebar</key>
<false/>
<key>ShowRuler</key>
<true/>
<key>Sidebar</key>
<true/>
<key>SidebarWidth</key>
<integer>120</integer>
<key>VisibleRegion</key>
<string>{{23, 0}, {558, 656}}</string>
<key>Zoom</key>
<real>1</real>
<key>ZoomValues</key>
<array>
<array>
<string>版面 1</string>
<real>1</real>
<real>1</real>
</array>
</array>
</dict>
</dict>
</plist>

Binary file not shown.

@ -0,0 +1,840 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>ApplicationVersion</key>
<array>
<string>com.omnigroup.OmniGrafflePro</string>
<string>139.16.0.171715</string>
</array>
<key>CreationDate</key>
<string>2013-11-10 07:01:04 +0000</string>
<key>Creator</key>
<string>黄 亿华</string>
<key>GraphDocumentVersion</key>
<integer>8</integer>
<key>GuidesLocked</key>
<string>NO</string>
<key>GuidesVisible</key>
<string>YES</string>
<key>ImageCounter</key>
<integer>6</integer>
<key>ImageLinkBack</key>
<array>
<dict/>
<dict/>
<dict/>
<dict/>
</array>
<key>ImageList</key>
<array>
<string>image5.tiff</string>
<string>image4.tiff</string>
<string>image2.tiff</string>
<string>image1.tiff</string>
</array>
<key>LinksVisible</key>
<string>NO</string>
<key>MagnetsVisible</key>
<string>NO</string>
<key>MasterSheets</key>
<array/>
<key>ModificationDate</key>
<string>2013-11-10 08:09:16 +0000</string>
<key>Modifier</key>
<string>黄 亿华</string>
<key>NotesVisible</key>
<string>NO</string>
<key>OriginVisible</key>
<string>NO</string>
<key>PageBreaks</key>
<string>YES</string>
<key>PrintInfo</key>
<dict>
<key>NSBottomMargin</key>
<array>
<string>float</string>
<string>41</string>
</array>
<key>NSHorizonalPagination</key>
<array>
<string>coded</string>
<string>BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG</string>
</array>
<key>NSLeftMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
<key>NSPaperSize</key>
<array>
<string>size</string>
<string>{594.99997329711914, 842}</string>
</array>
<key>NSPrintReverseOrientation</key>
<array>
<string>int</string>
<string>0</string>
</array>
<key>NSRightMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
<key>NSTopMargin</key>
<array>
<string>float</string>
<string>18</string>
</array>
</dict>
<key>ReadOnly</key>
<string>NO</string>
<key>Sheets</key>
<array>
<dict>
<key>ActiveLayerIndex</key>
<integer>0</integer>
<key>AutoAdjust</key>
<true/>
<key>BackgroundGraphic</key>
<dict>
<key>Bounds</key>
<string>{{0, 0}, {558.99997329711914, 783}}</string>
<key>Class</key>
<string>SolidGraphic</string>
<key>ID</key>
<integer>2</integer>
<key>Style</key>
<dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<key>BaseZoom</key>
<integer>0</integer>
<key>CanvasOrigin</key>
<string>{0, 0}</string>
<key>ColumnAlign</key>
<integer>1</integer>
<key>ColumnSpacing</key>
<real>36</real>
<key>DisplayScale</key>
<string>1 0/72 in = 1 0/72 in</string>
<key>GraphicsList</key>
<array>
<dict>
<key>Bounds</key>
<string>{{390, 391.5}, {114, 90}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>ID</key>
<integer>7</integer>
<key>ImageID</key>
<integer>2</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>FillType</key>
<integer>2</integer>
<key>GradientAngle</key>
<real>90</real>
<key>GradientColor</key>
<dict>
<key>w</key>
<string>0.666667</string>
</dict>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<dict>
<key>Bounds</key>
<string>{{3, 265}, {181, 114}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>YES</string>
<key>Flow</key>
<string>Resize</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>LucidaSans-DemiItalic</string>
<key>Size</key>
<real>96</real>
</dict>
<key>ID</key>
<integer>6</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Pad</key>
<integer>0</integer>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaSans-Demi;}
{\colortbl;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\i\b\fs192 \cf1 M }</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
<key>Wrap</key>
<string>NO</string>
</dict>
<dict>
<key>Bounds</key>
<string>{{168, 314}, {77, 58}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>YES</string>
<key>Flow</key>
<string>Resize</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>LucidaSans-DemiItalic</string>
<key>Size</key>
<real>48</real>
</dict>
<key>ID</key>
<integer>5</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Pad</key>
<integer>0</integer>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaSans-Demi;}
{\colortbl;\red255\green255\blue255;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\i\b\fs96 \cf2 agi}</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
<key>Wrap</key>
<string>NO</string>
</dict>
<dict>
<key>Bounds</key>
<string>{{356, 201}, {86, 86}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>YES</string>
<key>Flow</key>
<string>Resize</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>LucidaBright-DemiItalic</string>
<key>Size</key>
<real>72</real>
</dict>
<key>ID</key>
<integer>4</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Pad</key>
<integer>0</integer>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaBright-Demi;}
{\colortbl;\red255\green255\blue255;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\i\b\fs144 \cf2 eb}</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
<key>Wrap</key>
<string>NO</string>
</dict>
<dict>
<key>Bounds</key>
<string>{{43, 114}, {395, 400}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>Clip</string>
<key>Flow</key>
<string>Clip</string>
<key>HFlip</key>
<string>YES</string>
<key>ID</key>
<integer>3</integer>
<key>ImageID</key>
<integer>1</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<dict>
<key>Bounds</key>
<string>{{-4, 114}, {535, 400}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>ID</key>
<integer>1</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Color</key>
<dict>
<key>b</key>
<string>0</string>
<key>g</key>
<string>0</string>
<key>r</key>
<string>0</string>
</dict>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
</array>
<key>GridInfo</key>
<dict/>
<key>HPages</key>
<integer>1</integer>
<key>KeepToScale</key>
<false/>
<key>Layers</key>
<array>
<dict>
<key>Lock</key>
<string>NO</string>
<key>Name</key>
<string>图层 1</string>
<key>Print</key>
<string>YES</string>
<key>View</key>
<string>YES</string>
</dict>
</array>
<key>LayoutInfo</key>
<dict>
<key>Animate</key>
<string>NO</string>
<key>circoMinDist</key>
<real>18</real>
<key>circoSeparation</key>
<real>0.0</real>
<key>layoutEngine</key>
<string>dot</string>
<key>neatoSeparation</key>
<real>0.0</real>
<key>twopiSeparation</key>
<real>0.0</real>
</dict>
<key>Orientation</key>
<integer>2</integer>
<key>PrintOnePage</key>
<false/>
<key>RowAlign</key>
<integer>1</integer>
<key>RowSpacing</key>
<real>36</real>
<key>SheetTitle</key>
<string>版面 1</string>
<key>UniqueID</key>
<integer>1</integer>
<key>VPages</key>
<integer>1</integer>
</dict>
<dict>
<key>ActiveLayerIndex</key>
<integer>0</integer>
<key>AutoAdjust</key>
<true/>
<key>BackgroundGraphic</key>
<dict>
<key>Bounds</key>
<string>{{0, 0}, {558.99997329711914, 783}}</string>
<key>Class</key>
<string>SolidGraphic</string>
<key>ID</key>
<integer>2</integer>
<key>Style</key>
<dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<key>BaseZoom</key>
<integer>0</integer>
<key>CanvasOrigin</key>
<string>{0, 0}</string>
<key>ColumnAlign</key>
<integer>1</integer>
<key>ColumnSpacing</key>
<real>36</real>
<key>DisplayScale</key>
<string>1 0/72 in = 1.0000 in</string>
<key>GraphicsList</key>
<array>
<dict>
<key>Bounds</key>
<string>{{232, 432}, {84, 93}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>ID</key>
<integer>10</integer>
<key>ImageID</key>
<integer>4</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<dict>
<key>Bounds</key>
<string>{{16, 421}, {500, 115}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>YES</string>
<key>Flow</key>
<string>Resize</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica-Bold</string>
<key>Size</key>
<real>96</real>
</dict>
<key>ID</key>
<integer>8</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Pad</key>
<integer>0</integer>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;\red0\green0\blue0;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\i\b\fs192 \cf2 Web agic}</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
<key>Wrap</key>
<string>NO</string>
</dict>
</array>
<key>GridInfo</key>
<dict/>
<key>HPages</key>
<integer>1</integer>
<key>KeepToScale</key>
<false/>
<key>Layers</key>
<array>
<dict>
<key>Lock</key>
<string>NO</string>
<key>Name</key>
<string>图层 1</string>
<key>Print</key>
<string>YES</string>
<key>View</key>
<string>YES</string>
</dict>
</array>
<key>LayoutInfo</key>
<dict>
<key>Animate</key>
<string>NO</string>
<key>circoMinDist</key>
<real>18</real>
<key>circoSeparation</key>
<real>0.0</real>
<key>layoutEngine</key>
<string>dot</string>
<key>neatoSeparation</key>
<real>0.0</real>
<key>twopiSeparation</key>
<real>0.0</real>
</dict>
<key>Orientation</key>
<integer>2</integer>
<key>PrintOnePage</key>
<false/>
<key>RowAlign</key>
<integer>1</integer>
<key>RowSpacing</key>
<real>36</real>
<key>SheetTitle</key>
<string>版面 2</string>
<key>UniqueID</key>
<integer>2</integer>
<key>VPages</key>
<integer>1</integer>
</dict>
<dict>
<key>ActiveLayerIndex</key>
<integer>0</integer>
<key>AutoAdjust</key>
<true/>
<key>BackgroundGraphic</key>
<dict>
<key>Bounds</key>
<string>{{0, 0}, {1117.9999465942383, 783}}</string>
<key>Class</key>
<string>SolidGraphic</string>
<key>ID</key>
<integer>2</integer>
<key>Style</key>
<dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<key>BaseZoom</key>
<integer>0</integer>
<key>CanvasOrigin</key>
<string>{0, 0}</string>
<key>ColumnAlign</key>
<integer>1</integer>
<key>ColumnSpacing</key>
<real>36</real>
<key>DisplayScale</key>
<string>1 0/72 in = 1.0000 in</string>
<key>GraphicsList</key>
<array>
<dict>
<key>Bounds</key>
<string>{{9, 277.5}, {114, 114}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>ID</key>
<integer>11</integer>
<key>ImageID</key>
<integer>5</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>shadow</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
</dict>
<dict>
<key>Bounds</key>
<string>{{100, 294}, {474, 115}}</string>
<key>Class</key>
<string>ShapedGraphic</string>
<key>FitText</key>
<string>YES</string>
<key>Flow</key>
<string>Resize</string>
<key>FontInfo</key>
<dict>
<key>Font</key>
<string>Helvetica-Bold</string>
<key>Size</key>
<real>96</real>
</dict>
<key>ID</key>
<integer>8</integer>
<key>Shape</key>
<string>Rectangle</string>
<key>Style</key>
<dict>
<key>fill</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
<key>stroke</key>
<dict>
<key>Draws</key>
<string>NO</string>
</dict>
</dict>
<key>Text</key>
<dict>
<key>Pad</key>
<integer>0</integer>
<key>Text</key>
<string>{\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
\f0\i\b\fs192 \cf0 WebMagic}</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
<key>Wrap</key>
<string>NO</string>
</dict>
</array>
<key>GridInfo</key>
<dict/>
<key>HPages</key>
<integer>2</integer>
<key>KeepToScale</key>
<false/>
<key>Layers</key>
<array>
<dict>
<key>Lock</key>
<string>NO</string>
<key>Name</key>
<string>图层 1</string>
<key>Print</key>
<string>YES</string>
<key>View</key>
<string>YES</string>
</dict>
</array>
<key>LayoutInfo</key>
<dict>
<key>Animate</key>
<string>NO</string>
<key>circoMinDist</key>
<real>18</real>
<key>circoSeparation</key>
<real>0.0</real>
<key>layoutEngine</key>
<string>dot</string>
<key>neatoSeparation</key>
<real>0.0</real>
<key>twopiSeparation</key>
<real>0.0</real>
</dict>
<key>Orientation</key>
<integer>2</integer>
<key>PrintOnePage</key>
<false/>
<key>RowAlign</key>
<integer>1</integer>
<key>RowSpacing</key>
<real>36</real>
<key>SheetTitle</key>
<string>版面 3</string>
<key>UniqueID</key>
<integer>3</integer>
<key>VPages</key>
<integer>1</integer>
</dict>
</array>
<key>SmartAlignmentGuidesActive</key>
<string>YES</string>
<key>SmartDistanceGuidesActive</key>
<string>YES</string>
<key>UseEntirePage</key>
<false/>
<key>WindowInfo</key>
<dict>
<key>CurrentSheet</key>
<integer>2</integer>
<key>ExpandedCanvases</key>
<array/>
<key>Frame</key>
<string>{{174, 77}, {771, 795}}</string>
<key>ListView</key>
<true/>
<key>OutlineWidth</key>
<integer>142</integer>
<key>RightSidebar</key>
<false/>
<key>ShowRuler</key>
<true/>
<key>Sidebar</key>
<true/>
<key>SidebarWidth</key>
<integer>120</integer>
<key>VisibleRegion</key>
<string>{{0, 0}, {636, 656}}</string>
<key>Zoom</key>
<real>1</real>
<key>ZoomValues</key>
<array>
<array>
<string>版面 1</string>
<real>1</real>
<real>1</real>
</array>
<array>
<string>版面 2</string>
<real>1</real>
<real>1</real>
</array>
<array>
<string>版面 3</string>
<real>1</real>
<real>1</real>
</array>
</array>
</dict>
</dict>
</plist>

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

@ -6,7 +6,7 @@
<version>7</version>
</parent>
<groupId>us.codecraft</groupId>
<version>0.4.0</version>
<version>0.4.2-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
@ -36,7 +36,7 @@
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
<url>git@github.com:code4craft/webmagic.git</url>
<tag>webmagic-0.4.0</tag>
<tag>HEAD</tag>
</scm>
<licenses>
<license>
@ -48,7 +48,7 @@
<modules>
<module>webmagic-core</module>
<module>webmagic-extension/</module>
<module>webmagic-samples/</module>
<module>webmagic-scripts/</module>
</modules>
<dependencyManagement>

@ -1,6 +1,6 @@
Release Notes
----
See old versions in [https://github.com/code4craft/webmagic/releases](https://github.com/code4craft/webmagic/releases)
See latest versions in [https://github.com/code4craft/webmagic/releases](https://github.com/code4craft/webmagic/releases)
*2012-9-4* `version0.3.0`

@ -0,0 +1,30 @@
WebMagic-Avalon项目计划
=======
WebMagic-Avalon项目的目标是打造一个可配置、可管理的爬虫以及一个可分享配置/脚本的平台,从而减少熟悉的开发者的开发量,并且让**不熟悉Java技术的人**也能简单的使用一个爬虫。
## Part1:webmagic-scripts
目标:使得可以用简单脚本的方式编写爬虫,从而为一些常用场景提供可流通的脚本。
例如，我需要抓github的仓库数据，可以这样写一个脚本(javascript)：
[https://github.com/code4craft/webmagic/tree/master/webmagic-scripts](https://github.com/code4craft/webmagic/tree/master/webmagic-scripts)
这个功能目前实现了一部分,但最终结果仍在实验阶段。欢迎大家积极参与并提出意见。
## Part2:webmagic-panel
一个集成了加载脚本、管理爬虫的后台。计划中。
## Part3:webmagic-market
一个可以分享、搜索和下载脚本的站点。计划中。
## 如何参与
webmagic目前由作者业余维护，仅仅为了分享和个人提高，没有任何盈利，也没有商业化打算。
欢迎以下几种形式的贡献:
1. 为webmagic项目本身提出改进意见可以通过邮件组、qq、oschina或者在github提交issue(推荐)的方式。
2. 参与WebMagic-Avalon计划的建设讨论包括产品设计、技术选型等可以直接回复这个issue。
3. 参与webmagic代码开发请fork一份代码修改后提交pull request给我。请使用尽量新的版本并说明修改内容。pull request接受后我会将你加为committer共同参与开发。

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.4.0</version>
<version>0.4.2-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -9,8 +9,8 @@ import java.util.ArrayList;
import java.util.List;
/**
*
* Object storing extracted result and urls to fetch.<br>
* Not thread safe.<br>
* Main method <br>
* {@link #getUrl()} get url of current page <br>
* {@link #getHtml()} get content of current page <br>
@ -19,9 +19,9 @@ import java.util.List;
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.downloader.Downloader
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public class Page {
@ -31,8 +31,12 @@ public class Page {
private Html html;
private String rawText;
private Selectable url;
private int statusCode;
private List<Request> targetRequests = new ArrayList<Request>();
public Page() {
@ -60,9 +64,17 @@ public class Page {
* @return html
*/
public Html getHtml() {
    // Already parsed (or explicitly set): reuse the cached wrapper.
    if (html != null) {
        return html;
    }
    // First access: build the Html from the raw text, resolving relative links
    // against the url of the request that produced this page.
    html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
    return html;
}
/**
 * Store an already-built html object on this page.
 *
 * @param html html object to store
 * @deprecated since 0.4.0
 * The html is parsed lazily, on the first call of {@link #getHtml()}, so use {@link #setRawText(String)} instead.
 */
public void setHtml(Html html) {
this.html = html;
}
@ -93,7 +105,7 @@ public class Page {
*
* @param requests
*/
public void addTargetRequests(List<String> requests,long priority) {
public void addTargetRequests(List<String> requests, long priority) {
synchronized (targetRequests) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
@ -162,13 +174,31 @@ public class Page {
return resultItems;
}
/**
 * @return the status code stored on this page
 */
public int getStatusCode() {
return statusCode;
}
/**
 * @param statusCode the status code to store on this page
 */
public void setStatusCode(int statusCode) {
this.statusCode = statusCode;
}
/**
 * @return the raw, unparsed text of this page
 */
public String getRawText() {
return rawText;
}
/**
 * Set the raw, unparsed text of this page.
 *
 * @param rawText raw text
 * @return this, so calls can be chained
 */
public Page setRawText(String rawText) {
this.rawText = rawText;
return this;
}
/**
 * Debug representation listing every field of the page.
 */
@Override
public String toString() {
    StringBuilder sb = new StringBuilder("Page{");
    sb.append("request=").append(request);
    sb.append(", resultItems=").append(resultItems);
    sb.append(", html=").append(html);
    sb.append(", rawText='").append(rawText).append('\'');
    sb.append(", url=").append(url);
    sb.append(", statusCode=").append(statusCode);
    sb.append(", targetRequests=").append(targetRequests);
    sb.append('}');
    return sb.toString();
}

@ -6,9 +6,9 @@ import org.apache.log4j.Logger;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
@ -18,12 +18,10 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.UUID;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
@ -98,6 +96,10 @@ public class Spider implements Runnable, Task {
private Condition newUrlCondition = newUrlLock.newCondition();
private final AtomicInteger threadAlive = new AtomicInteger(0);
private final AtomicLong pageCount = new AtomicLong(0);
/**
* create a spider with pageProcessor.
*
@ -189,7 +191,7 @@ public class Spider implements Runnable, Task {
*
* @param pipeline
* @return this
* @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @deprecated
*/
public Spider pipeline(Pipeline pipeline) {
@ -210,6 +212,20 @@ public class Spider implements Runnable, Task {
return this;
}
/**
 * Replace the pipelines of this Spider with the given list.
 * <p>
 * The list is stored as-is (not copied), after {@code checkIfRunning()} has been called.
 *
 * @param pipelines the pipelines to use
 * @return this
 * @see Pipeline
 * @since 0.4.1
 */
public Spider setPipelines(List<Pipeline> pipelines) {
checkIfRunning();
this.pipelines = pipelines;
return this;
}
/**
* clear the pipelines set
*
@ -262,6 +278,7 @@ public class Spider implements Runnable, Task {
}
startRequests.clear();
}
threadAlive.set(0);
}
@Override
@ -269,7 +286,6 @@ public class Spider implements Runnable, Task {
checkRunningStat();
initComponent();
logger.info("Spider " + getUUID() + " started!");
final AtomicInteger threadAlive = new AtomicInteger(0);
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request request = scheduler.poll(this);
if (request == null) {
@ -290,6 +306,7 @@ public class Spider implements Runnable, Task {
logger.error("download " + requestFinal + " error", e);
} finally {
threadAlive.decrementAndGet();
pageCount.incrementAndGet();
signalNewUrl();
}
}
@ -355,7 +372,7 @@ public class Spider implements Runnable, Task {
return;
}
// for cycle retry
if (page.getHtml() == null) {
if (page.getRawText() == null) {
extractAndAddRequests(page);
sleep(site.getSleepTime());
return;
@ -471,6 +488,10 @@ public class Spider implements Runnable, Task {
private void waitNewUrl() {
try {
newUrlLock.lock();
//double check
if (threadAlive.get() == 0 && exitWhenComplete) {
return;
}
try {
newUrlCondition.await();
} catch (InterruptedException e) {
@ -546,6 +567,61 @@ public class Spider implements Runnable, Task {
return spawnUrl;
}
/**
 * Get the number of requests this spider has finished handling.
 * <p>
 * The counter is incremented in the <code>finally</code> branch of each download task,
 * so requests whose handling ended with an error are counted as well.
 *
 * @return total count of handled requests
 * @since 0.4.1
 */
public long getPageCount() {
return pageCount.get();
}
/**
 * Get the current running status of this spider.
 * <p>
 * The internal integer state is mapped through {@link Status#fromValue(int)};
 * values without a matching constant are reported as {@link Status#Init}.
 *
 * @return running status
 * @see Status
 * @since 0.4.1
 */
public Status getStatus(){
return Status.fromValue(stat.get());
}
/**
 * Running status of a spider, each constant carrying an integer code.
 * NOTE(review): the codes 0/1/2 presumably mirror the spider's internal state constants - confirm.
 */
public enum Status {
    Init(0), Running(1), Stopped(2);

    /** Integer code of this status; never changes after construction. */
    private final int value;

    private Status(int value) {
        this.value = value;
    }

    /**
     * @return the integer code of this status
     */
    int getValue() {
        return value;
    }

    /**
     * Look up the status for an integer code.
     *
     * @param value integer code
     * @return the matching status, or {@link #Init} when no constant has that code
     */
    public static Status fromValue(int value) {
        for (Status status : Status.values()) {
            if (status.getValue() == value) {
                return status;
            }
        }
        //default value
        return Init;
    }
}
/**
 * Get the current value of the alive-thread counter, i.e. the number of
 * download tasks that are running at the moment of the call.
 *
 * @return count of running threads
 * @since 0.4.1
 */
public int getThreadAlive() {
return threadAlive.get();
}
/**
* Whether add urls extracted to download.<br>
* Add urls to download when it is true, and just download seed urls when it is false. <br>

@ -162,9 +162,10 @@ public class HttpClientDownloader implements Downloader {
/**
 * Build a {@link Page} from a http response.
 * <p>
 * Only the raw text is stored; the html is parsed lazily by {@link Page#getHtml()},
 * so no Html object is created here.
 *
 * @param request      the request that produced the response
 * @param charset      charset used to decode the response body
 * @param httpResponse the response to read
 * @param task         the task the request belongs to (not used here)
 * @return page carrying raw text, url, request and status code
 * @throws IOException if the response body cannot be read
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    java.io.InputStream body = httpResponse.getEntity().getContent();
    String content;
    try {
        content = IOUtils.toString(body, charset);
    } finally {
        // IOUtils.toString does not close the stream; close it so the connection can be released
        IOUtils.closeQuietly(body);
    }
    Page page = new Page();
    page.setRawText(content);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    return page;
}

@ -23,7 +23,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
//skip this page
page.setSkip(true);
}
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
page.putField("content", page.getHtml().smartContent().toString());
page.putField("tags", page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}

@ -23,10 +23,10 @@ public interface Scheduler {
public void push(Request request, Task task);
/**
*
* get an url to crawl
*
* @param task SchedulerTask
* @return
* @param task the task of spider
* @return the url to crawl
*/
public Request poll(Task task);

@ -9,7 +9,7 @@ import java.util.ArrayList;
import java.util.List;
/**
* Selectable plain text.<br>
* Selectable html.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
@ -23,16 +23,28 @@ public class Html extends PlainText {
*/
private Document document;
private boolean init = false;
public Html(List<String> strings) {
super(strings);
}
public Html(String text) {
super(text);
try {
this.document = Jsoup.parse(text);
} catch (Exception e) {
logger.warn("parse document error ", e);
}
/**
* lazy init
*/
private void initDocument() {
if (this.document == null && !init) {
init = true;
//just init once whether the parsing succeeds or not
try {
this.document = Jsoup.parse(getText());
} catch (Exception e) {
logger.warn("parse document error ", e);
}
}
}
@ -47,6 +59,7 @@ public class Html extends PlainText {
@Override
protected Selectable select(Selector selector, List<String> strings) {
initDocument();
List<String> results = new ArrayList<String>();
for (String string : strings) {
String result = selector.select(string);
@ -59,6 +72,7 @@ public class Html extends PlainText {
@Override
protected Selectable selectList(Selector selector, List<String> strings) {
initDocument();
List<String> results = new ArrayList<String>();
for (String string : strings) {
List<String> result = selector.selectList(string);
@ -69,6 +83,7 @@ public class Html extends PlainText {
@Override
public Selectable smartContent() {
initDocument();
SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, strings);
}

@ -45,6 +45,16 @@ public class PlainText implements Selectable {
throw new UnsupportedOperationException();
}
/**
 * Alias of {@link #$(String)}: delegates to it with the same selector.
 */
@Override
public Selectable css(String selector) {
return $(selector);
}
/**
 * Alias of {@link #$(String, String)}: delegates to it with the same selector and attribute name.
 */
@Override
public Selectable css(String selector, String attrName) {
return $(selector, attrName);
}
@Override
public Selectable smartContent() {
throw new UnsupportedOperationException();

@ -35,6 +35,23 @@ public interface Selectable {
*/
public Selectable $(String selector, String attrName);
/**
* select list with css selector
*
* @param selector css selector expression
* @return new Selectable after extract
*/
public Selectable css(String selector);
/**
* select list with css selector
*
* @param selector css selector expression
* @param attrName attribute name of css selector
* @return new Selectable after extract
*/
public Selectable css(String selector, String attrName);
/**
* select smart content with ReadAbility algorithm
*

@ -1,100 +1,87 @@
package us.codecraft.webmagic.selector;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import us.codecraft.webmagic.utils.Experimental;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Extract the text content of html.<br>
* Using Readability algorithm: find parents of all p tags.
* Borrowed from https://code.google.com/p/cx-extractor/
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @since 0.4.1
*
*/
@Experimental
public class SmartContentSelector implements Selector {
private Logger logger = Logger.getLogger(getClass());
public SmartContentSelector() {
}
@Override
public String select(String text) {
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode tagNode = htmlCleaner.clean(text);
if (tagNode == null) {
return null;
}
TagNode[] nodes = tagNode.getElementsByName("p", true);
TagNode[] pres = tagNode.getElementsByName("pre", true);
Map<TagNode, Double> pDensityCountMap = new HashMap<TagNode, Double>();
countPdensity(nodes, pDensityCountMap);
countPdensity(pres, pDensityCountMap);
for (TagNode pre : pres) {
addCounter(pre, pDensityCountMap, 2);
}
List<Map.Entry<TagNode, Double>> sortList = new ArrayList<Map.Entry<TagNode, Double>>();
if (pDensityCountMap.size() == 0) {
return null;
}
for (Map.Entry<TagNode, Double> entry : pDensityCountMap.entrySet()) {
// if (logger.isDebugEnabled()) {
// logger.debug("p\t" + entry.getKey().getName() + "#" + entry.getKey().getAttributeByName("id") +
// "@" + entry.getKey().getAttributeByName("class") + ":" + entry.getValue());
// }
sortList.add(entry);
}
public String select(String html) {
html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
html = html.replaceAll("(?is)<!--.*?-->", ""); // remove html comment
html = html.replaceAll("(?is)<script.*?>.*?</script>", ""); // remove javascript
html = html.replaceAll("(?is)<style.*?>.*?</style>", ""); // remove css
html = html.replaceAll("&.{2,5};|&#.{2,5};", " "); // remove special char
html = html.replaceAll("(?is)<.*?>", "");
List<String> lines;
int blocksWidth =3;
int threshold =86;
int start;
int end;
StringBuilder text = new StringBuilder();
ArrayList<Integer> indexDistribution = new ArrayList<Integer>();
Collections.sort(sortList, new Comparator<Map.Entry<TagNode, Double>>() {
@Override
public int compare(Map.Entry<TagNode, Double> o1, Map.Entry<TagNode, Double> o2) {
Double d1 = o1.getValue();
Double d2 = o2.getValue();
return -d1.compareTo(d2);
}
});
TagNode contentNode = sortList.get(0).getKey();
if (logger.isDebugEnabled()) {
logger.debug("p\t" + contentNode.getName() + "#" + contentNode.getAttributeByName("id") +
"@" + contentNode.getAttributeByName("class"));
}
return htmlCleaner.getInnerHtml(contentNode);
}
lines = Arrays.asList(html.split("\n"));
private void addCounter(TagNode node, Map<TagNode, Double> countMap, double delta) {
Double counter = countMap.get(node);
if (counter == null) {
countMap.put(node, delta);
} else {
countMap.put(node, counter + delta);
for (int i = 0; i < lines.size() - blocksWidth; i++) {
int wordsNum = 0;
for (int j = i; j < i + blocksWidth; j++) {
lines.set(j, lines.get(j).replaceAll("\\s+", ""));
wordsNum += lines.get(j).length();
}
indexDistribution.add(wordsNum);
}
}
private static final double parentWeight = 0.7;
start = -1; end = -1;
boolean boolstart = false, boolend = false;
text.setLength(0);
private void countPdensity(TagNode[] nodes, Map<TagNode, Double> pDensityCountMap) {
for (TagNode node : nodes) {
if (node == null) {
continue;
for (int i = 0; i < indexDistribution.size() - 1; i++) {
if (indexDistribution.get(i) > threshold && ! boolstart) {
if (indexDistribution.get(i+1).intValue() != 0
|| indexDistribution.get(i+2).intValue() != 0
|| indexDistribution.get(i+3).intValue() != 0) {
boolstart = true;
start = i;
continue;
}
}
TagNode parent = node.getParent();
double pDensity = 1;
while (parent != null) {
addCounter(parent, pDensityCountMap, pDensity);
parent = parent.getParent();
pDensity = pDensity * parentWeight;
if (boolstart) {
if (indexDistribution.get(i).intValue() == 0
|| indexDistribution.get(i+1).intValue() == 0) {
end = i;
boolend = true;
}
}
StringBuilder tmp = new StringBuilder();
if (boolend) {
//System.out.println(start+1 + "\t\t" + end+1);
for (int ii = start; ii <= end; ii++) {
if (lines.get(ii).length() < 5) continue;
tmp.append(lines.get(ii) + "\n");
}
String str = tmp.toString();
//System.out.println(str);
if (str.contains("Copyright") ) continue;
text.append(str);
boolstart = boolend = false;
}
}
}
private TagNode findLowestCommonParent(List<TagNode> tagNodes, int maxMargin, Map<TagNode, AtomicInteger> countMap) {
TagNode contentNode = tagNodes.get(0);
return contentNode;
return text.toString();
}
@Override

@ -8,21 +8,11 @@
</layout>
</appender>
<logger name="org.springframework" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />

@ -2,8 +2,14 @@ package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.scheduler.Scheduler;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author code4crafter@gmail.com
@ -26,4 +32,62 @@ public class SpiderTest {
spider.start();
Thread.sleep(10000);
}
@Ignore("long time")
@Test
public void testWaitAndNotify() throws InterruptedException {
for (int i = 0; i < 10000; i++) {
System.out.println("round" + i);
testRound();
}
}
/**
 * Run one spider with 10 threads against in-memory stubs: the downloader returns an
 * empty page, the processor skips every page, and the scheduler hands out at most
 * 1000 polls, randomly returning null in between to exercise the wait/notify path.
 */
private void testRound() {
    Spider spider = Spider.create(new PageProcessor() {

        @Override
        public void process(Page page) {
            page.setSkip(true);
        }

        @Override
        public Site getSite() {
            return Site.me().setSleepTime(0);
        }
    }).setDownloader(new Downloader() {

        @Override
        public Page download(Request request, Task task) {
            return new Page().setRawText("");
        }

        @Override
        public void setThread(int threadNum) {
        }
    }).setScheduler(new Scheduler() {

        private AtomicInteger count = new AtomicInteger();

        private Random random = new Random();

        @Override
        public void push(Request request, Task task) {
        }

        @Override
        public synchronized Request poll(Task task) {
            // stop handing out requests after 1000 polls
            if (count.incrementAndGet() > 1000) {
                return null;
            }
            // roughly 9% of the polls pretend the queue is temporarily empty
            if (random.nextInt(100) > 90) {
                return null;
            }
            return new Request("test");
        }
    }).thread(10);
    spider.run();
}
}

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.4.0</version>
<version>0.4.2-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -0,0 +1,36 @@
package us.codecraft.webmagic.example;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.utils.Experimental;
import java.util.List;
/**
 * Example page model: each field is filled from the JSON of an iTunes lookup
 * response through a JsonPath expression.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
@Experimental
public class AppStore {

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName")
    private String trackName;

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description")
    private String description;

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..userRatingCount")
    private int userRatingCount;

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls",multi = true)
    private List<String> screenshotUrls;

    /**
     * Fetch one lookup result and print the extracted fields.
     */
    public static void main(String[] args) {
        String lookupUrl = "http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software";
        OOSpider spider = OOSpider.create(Site.me(), AppStore.class);
        AppStore appStore = spider.<AppStore>get(lookupUrl);
        System.out.println(appStore.trackName);
        System.out.println(appStore.description);
        System.out.println(appStore.userRatingCount);
        System.out.println(appStore.screenshotUrls);
    }
}

@ -46,4 +46,12 @@ public class BaiduBaike{
}
ooSpider.close();
}
public String getName() {
return name;
}
public String getDescription() {
return description;
}
}

@ -0,0 +1,70 @@
package us.codecraft.webmagic.example;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import java.util.List;
/**
 * Example page model for the JSON returned by the GitHub repository API.
 * Fields are filled from JsonPath expressions; the key is "author:name".
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.4.1
 */
public class GithubRepoApi implements HasKey {

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name")
    private String name;

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..owner.login")
    private String author;

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.language",multi = true)
    private List<String> language;

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.stargazers_count")
    private int star;

    @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.forks_count")
    private int fork;

    @ExtractByUrl
    private String url;

    /**
     * Crawl the webmagic repository entry and print the extracted model to the console.
     */
    public static void main(String[] args) {
        Site site = Site.me().setSleepTime(100);
        OOSpider.create(site, new ConsolePageModelPipeline(), GithubRepoApi.class)
                .addUrl("https://api.github.com/repos/code4craft/webmagic")
                .run();
    }

    /**
     * @return "author:name"
     */
    @Override
    public String key() {
        return author + ":" + name;
    }

    public String getName() {
        return name;
    }

    public String getAuthor() {
        return author;
    }

    public List<String> getLanguage() {
        return language;
    }

    public String getUrl() {
        return url;
    }

    public int getStar() {
        return star;
    }

    public int getFork() {
        return fork;
    }
}

@ -31,8 +31,7 @@ public class OschinaBlog {
private Date date;
public static void main(String[] args) {
OOSpider.create(Site.me().setSleepTime(0)
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
.addUrl("http://my.oschina.net/flashsword/blog").run();
}

@ -239,7 +239,7 @@ class PageModelExtractor {
} else {
if (objectExtractor.multi) {
List<Object> os = new ArrayList<Object>();
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
List<String> list = objectExtractor.getSelector().selectList(page.getRawText());
for (String s : list) {
Object o = processSingle(page, s, false);
if (o != null) {
@ -248,7 +248,7 @@ class PageModelExtractor {
}
return os;
} else {
String select = objectExtractor.getSelector().select(page.getHtml().toString());
String select = objectExtractor.getSelector().select(page.getRawText());
Object o = processSingle(page, select, false);
return o;
}

@ -24,7 +24,7 @@ public @interface ExtractBy {
/**
* types of extractor expressions
*/
public static enum Type {XPath, Regex, Css}
public static enum Type {XPath, Regex, Css, JsonPath}
/**
* Extractor type, support XPath, CSS Selector and regex.

@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Define a extractor for url. Only regex can be used. <br>
* Define a extractor to extract data in url of current page. Only regex can be used. <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.0

@ -1,5 +1,6 @@
package us.codecraft.webmagic.scheduler;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Request;
@ -94,6 +95,9 @@ public class FileCacheQueueScheduler implements Scheduler {
urls = new LinkedHashSet<String>();
readCursorFile();
readUrlFile();
} catch (FileNotFoundException e) {
//init
logger.info("init cache file " + getFileName(fileUrlAllName));
} catch (IOException e) {
logger.error("init file error", e);
}
@ -101,23 +105,37 @@ public class FileCacheQueueScheduler implements Scheduler {
private void readUrlFile() throws IOException {
String line;
BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)));
int lineReaded = 0;
while ((line = fileUrlReader.readLine()) != null) {
urls.add(line.trim());
lineReaded++;
if (lineReaded > cursor.get()) {
queue.add(new Request(line));
BufferedReader fileUrlReader = null;
try {
fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)));
int lineReaded = 0;
while ((line = fileUrlReader.readLine()) != null) {
urls.add(line.trim());
lineReaded++;
if (lineReaded > cursor.get()) {
queue.add(new Request(line));
}
}
} finally {
if (fileUrlReader != null) {
IOUtils.closeQuietly(fileUrlReader);
}
}
}
private void readCursorFile() throws IOException {
BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
String line;
//read the last number
while ((line = fileCursorReader.readLine()) != null) {
cursor = new AtomicInteger(NumberUtils.toInt(line));
BufferedReader fileCursorReader = null;
try {
new BufferedReader(new FileReader(getFileName(fileCursor)));
String line;
//read the last number
while ((line = fileCursorReader.readLine()) != null) {
cursor = new AtomicInteger(NumberUtils.toInt(line));
}
} finally {
if (fileCursorReader != null) {
IOUtils.closeQuietly(fileCursorReader);
}
}
}

@ -1,6 +1,7 @@
package us.codecraft.webmagic.selector;
import com.jayway.jsonpath.JsonPath;
import us.codecraft.webmagic.utils.Experimental;
import java.util.ArrayList;
import java.util.List;
@ -12,6 +13,7 @@ import java.util.List;
* @author code4crafter@gmail.com <br>
* @since 0.2.1
*/
@Experimental
public class JsonPathSelector implements Selector {
private String jsonPathStr;

@ -27,6 +27,9 @@ public class ExtractorUtils {
case XPath:
selector = getXpathSelector(value);
break;
case JsonPath:
selector = new JsonPathSelector(value);
break;
default:
selector = getXpathSelector(value);
}

@ -1,13 +1,15 @@
package us.codecraft.webmagic;
package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
/**
* @author code4crafter@gmail.com
*/
public class MockDownloader implements Downloader{
public class MockGithubDownloader implements Downloader{
private String html = "\n" +
"\n" +

@ -2,7 +2,7 @@ package us.codecraft.webmagic.model;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.example.GithubRepo;
@ -22,6 +22,6 @@ public class GithubRepoTest {
Assert.assertEquals(86, o.getStar());
Assert.assertEquals(70, o.getFork());
}
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
}, GithubRepo.class).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
}
}

@ -3,6 +3,7 @@ package us.codecraft.webmagic.processor;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.pipeline.Pipeline;
@ -29,7 +30,7 @@ public class GithubRepoProcessor implements PageProcessor {
Assert.assertEquals("78",((String)resultItems.get("star")).trim());
Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
}
}).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
}).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
}
}

@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.0-SNAPSHOT</version>
<version>0.4.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.0</version>
<version>0.4.1</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -0,0 +1,59 @@
webmagic-scripts
======
## 目标:
使得可以用简单脚本的方式编写爬虫,从而为一些常用场景提供可流通的脚本。如果已经有人写好了脚本,那么你直接使用就可以了!
## 实例:
例如，我需要抓github的仓库数据，可以这样写一个脚本(javascript)：
```javascript
var name=xpath("//h1[@class='entry-title public']/strong/a/text()")
var readme=xpath("//div[@id='readme']/tidyText()")
var star=xpath("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()")
var fork=xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()")
var url=page.getUrl().toString()
if (name!=null){
println(name)
println(readme)
println(star)
println(url)
}
urls("(https://github\\.com/\\w+/\\w+)")
urls("(https://github\\.com/\\w+)")
```
然后使用webmagic加载并启动它无需下载依赖、编写代码、执行的过程。目前已经有控制台版本请下载[http://code4craft.qiniudn.com/webmagic-console.tar.gz](http://code4craft.qiniudn.com/webmagic-console.tar.gz)。
解压后,使用以下命令执行:
java -jar -Dfile.encoding='utf-8' webmagic-console.jar -f 脚本文件名 [-l 语言默认是javascript] [-t 线程数] [-s 抓取间隔,毫秒] url1 url2 …
例如对于github这个脚本我可以这样执行
java -jar -Dfile.encoding='utf-8' webmagic-console.jar -f github.js -t 2 -s 0 https://github.com/code4craft
目前这部分使用Java的ScriptEngine机制完成。
## 语言:
选用javascript是因为用户面比较广。目前还支持ruby语言，选用ruby是因为ruby的语法编写DSL更简洁：
```ruby
name= xpath "//h1[@class='entry-title public']/strong/a/text()"
readme = xpath "//div[@id='readme']/tidyText()"
star = xpath "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"
fork = xpath "//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()"
url=$page.getUrl().toString()
puts name,readme,star,fork,url unless name==nil
urls "(https://github\\.com/\\w+/\\w+)"
urls "(https://github\\.com/\\w+)"
```
多语言通过参数-l区分例如执行这个ruby脚本需要
java -jar -Dfile.encoding='utf-8' webmagic-console.jar -f github.rb -t2 -s0 -l ruby https://github.com/code4craft
这个功能目前仍在实验阶段。欢迎大家积极参与并提出意见。

@ -0,0 +1,5 @@
#!/bin/sh
# Build webmagic-scripts and install the console jar plus its dependencies
# under /usr/local/webmagic.

# Abort on the first failing command so a broken build is never installed.
set -e

# Must match the version the module inherits from webmagic-parent.
VERSION="0.4.2-SNAPSHOT"

mvn clean package
cp "target/webmagic-scripts-${VERSION}.jar" /usr/local/webmagic/webmagic-console.jar
rsync -avz --delete target/lib/ /usr/local/webmagic/lib/

@ -0,0 +1,89 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>webmagic-parent</artifactId>
        <groupId>us.codecraft</groupId>
        <version>0.4.2-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-scripts</artifactId>

    <dependencies>
        <!-- script engine for ruby scripts -->
        <dependency>
            <groupId>org.jruby</groupId>
            <artifactId>jruby</artifactId>
            <version>1.7.6</version>
        </dependency>
        <!-- command line argument parsing for the console -->
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${project.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- copy all dependencies to target/lib during package -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-dependency-plugin</artifactId>
                <executions>
                    <execution>
                        <id>copy-dependencies</id>
                        <phase>package</phase>
                        <goals>
                            <goal>copy-dependencies</goal>
                        </goals>
                        <configuration>
                            <outputDirectory>${project.build.directory}/lib</outputDirectory>
                            <overWriteReleases>false</overWriteReleases>
                            <overWriteSnapshots>false</overWriteSnapshots>
                            <overWriteIfNewer>true</overWriteIfNewer>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.6</source>
                    <target>1.6</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <configuration>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- executable jar: manifest class path points at ./lib/, main class is the script console -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>true</addClasspath>
                            <classpathPrefix>./lib/</classpathPrefix>
                            <mainClass>us.codecraft.webmagic.scripts.ScriptConsole</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

@ -0,0 +1,35 @@
package us.codecraft.webmagic.scripts;
/**
 * Script languages supported by the scripts module.
 * <p>
 * Each constant carries the name used to look up its script engine and the
 * classpath location of the helper definitions evaluated before a user script.
 *
 * @author code4crafter@gmail.com
 */
public enum Language {

    JavaScript("javascript", "js/defines.js", ""),

    JRuby("jruby", "ruby/defines.rb", "");

    // Enum constants are shared singletons, so their state is kept immutable.
    private final String engineName;

    private final String defineFile;

    private final String gatherFile;

    Language(String engineName, String defineFile, String gatherFile) {
        this.engineName = engineName;
        this.defineFile = defineFile;
        this.gatherFile = gatherFile;
    }

    /**
     * @return the name passed to the script engine lookup for this language
     */
    public String getEngineName() {
        return engineName;
    }

    /**
     * @return classpath-relative path of the helper definitions for this language
     */
    public String getDefineFile() {
        return defineFile;
    }

    /**
     * @return the gather file for this language; empty for every constant declared here
     */
    public String getGatherFile() {
        return gatherFile;
    }
}

@ -0,0 +1,183 @@
package us.codecraft.webmagic.scripts;
import com.google.common.collect.Sets;
import org.apache.commons.cli.*;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Command line entry point that runs a spider whose page processing is done by a script.
 * <p>
 * Options: {@code -l} language, {@code -f} script file (required), {@code -t} thread number,
 * {@code -s} sleep time, {@code -g} log level. Remaining arguments are the start urls.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptConsole {

    /** Usage line printed whenever the command line cannot be used. */
    private static final String USAGE =
            "Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]";

    /** Settings read from the command line; the initial values are the defaults for absent options. */
    private static class Params {

        Language language = Language.JavaScript;

        String scriptFileName;

        List<String> urls;

        int thread = 1;

        int sleepTime = 1000;

        /** Accepted spellings of each language on the command line. */
        private static final Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();

        static {
            alias.put(Language.JavaScript, Sets.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
            alias.put(Language.JRuby, Sets.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
        }

        /**
         * Selects the language whose alias set contains {@code arg}.
         * When no alias matches, the current language is kept and a warning is written to stderr.
         *
         * @param arg value of the language option
         */
        public void setLanguagefromArg(String arg) {
            for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
                if (languageSetEntry.getValue().contains(arg)) {
                    this.language = languageSetEntry.getKey();
                    return;
                }
            }
            // Previously an unknown value was ignored without any feedback.
            System.err.println("Unknown language '" + arg + "', using " + language.getEngineName());
        }

        private Language getLanguage() {
            return language;
        }

        private void setLanguage(Language language) {
            this.language = language;
        }

        private String getScriptFileName() {
            return scriptFileName;
        }

        private void setScriptFileName(String scriptFileName) {
            this.scriptFileName = scriptFileName;
        }

        private List<String> getUrls() {
            return urls;
        }

        private void setUrls(List<String> urls) {
            this.urls = urls;
        }

        private int getThread() {
            return thread;
        }

        private void setThread(int thread) {
            this.thread = thread;
        }

        private int getSleepTime() {
            return sleepTime;
        }

        private void setSleepTime(int sleepTime) {
            this.sleepTime = sleepTime;
        }
    }

    public static void main(String[] args) {
        Params params = parseCommand(args);
        startSpider(params);
    }

    /**
     * Builds the script processor and spider from {@code params} and runs the crawl.
     * Exits the JVM with status -1 when no start url was given.
     */
    private static void startSpider(Params params) {
        ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
                .language(params.getLanguage())
                .scriptFromFile(params.getScriptFileName())
                .thread(params.getThread())
                .build();
        pageProcessor.getSite().setSleepTime(params.getSleepTime());
        pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404, 500));
        Spider spider = Spider.create(pageProcessor).thread(params.getThread());
        // Replace the configured pipelines with one that does nothing.
        spider.clearPipeline().addPipeline(new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
            }
        });
        if (params.getUrls() == null || params.getUrls().size() == 0) {
            System.err.println("Need at least one argument");
            System.out.println(USAGE);
            System.exit(-1);
        }
        for (String url : params.getUrls()) {
            spider.addUrl(url);
        }
        spider.run();
    }

    /**
     * Parses the raw arguments into {@link Params}.
     * Any failure (unknown option, non-numeric number, ...) prints the stack trace and exits.
     */
    private static Params parseCommand(String[] args) {
        try {
            Options options = new Options();
            options.addOption(new Option("l", "language", true, "language"));
            options.addOption(new Option("t", "thread", true, "thread"));
            options.addOption(new Option("f", "file", true, "script file"));
            options.addOption(new Option("i", "input", true, "input file"));
            options.addOption(new Option("s", "sleep", true, "sleep time"));
            // Description fixed: it used to be a copy of the "sleep time" text.
            options.addOption(new Option("g", "logger", true, "log level"));
            CommandLineParser commandLineParser = new PosixParser();
            CommandLine commandLine = commandLineParser.parse(options, args);
            return readOptions(commandLine);
        } catch (Exception e) {
            e.printStackTrace();
            exit();
            return null;
        }
    }

    /** Reports a malformed command line and terminates the JVM with status -1. */
    private static void exit() {
        System.err.println("Format error");
        System.out.println(USAGE);
        System.exit(-1);
    }

    /**
     * Copies the parsed options into a new {@link Params}; exits when the script file option is missing.
     */
    private static Params readOptions(CommandLine commandLine) {
        Params params = new Params();
        if (commandLine.hasOption("l")) {
            String language = commandLine.getOptionValue("l");
            params.setLanguagefromArg(language);
        }
        if (commandLine.hasOption("f")) {
            String scriptFilename = commandLine.getOptionValue("f");
            params.setScriptFileName(scriptFilename);
        } else {
            exit();
        }
        if (commandLine.hasOption("s")) {
            int sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
            params.setSleepTime(sleepTime);
        }
        if (commandLine.hasOption("t")) {
            int thread = Integer.parseInt(commandLine.getOptionValue("t"));
            params.setThread(thread);
        }
        if (commandLine.hasOption("g")) {
            configLogger(commandLine.getOptionValue("g"));
        }
        params.setUrls(commandLine.getArgList());
        return params;
    }

    /**
     * Sets the root logger level from a case-insensitive level name.
     * Names other than debug, info, warn, trace, off and error leave the level untouched.
     */
    private static void configLogger(String value) {
        Logger rootLogger = Logger.getRootLogger();
        if ("debug".equalsIgnoreCase(value)) {
            rootLogger.setLevel(Level.DEBUG);
        } else if ("info".equalsIgnoreCase(value)) {
            rootLogger.setLevel(Level.INFO);
        } else if ("warn".equalsIgnoreCase(value)) {
            rootLogger.setLevel(Level.WARN);
        } else if ("trace".equalsIgnoreCase(value)) {
            rootLogger.setLevel(Level.TRACE);
        } else if ("off".equalsIgnoreCase(value)) {
            rootLogger.setLevel(Level.OFF);
        } else if ("error".equalsIgnoreCase(value)) {
            rootLogger.setLevel(Level.ERROR);
        }
    }
}

@ -0,0 +1,39 @@
package us.codecraft.webmagic.scripts;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Fixed-size pool of {@link ScriptEngine} instances for one language.
 * <p>
 * Callers borrow an engine with {@link #getEngine()} and must hand it back with
 * {@link #release(ScriptEngine)}.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptEnginePool {

    private final int size;

    /** Number of engines currently sitting in the queue. */
    private final AtomicInteger availableCount;

    private final LinkedBlockingQueue<ScriptEngine> scriptEngines = new LinkedBlockingQueue<ScriptEngine>();

    /**
     * Creates {@code size} engines for {@code language}.
     *
     * @param language language whose engine name is used for the lookup
     * @param size     number of engines to create
     * @throws IllegalStateException if no engine is registered under the language's engine name
     */
    public ScriptEnginePool(Language language, int size) {
        this.size = size;
        this.availableCount = new AtomicInteger(size);
        for (int i = 0; i < size; i++) {
            // A separate manager per engine, as before; presumably to keep engines isolated from each other.
            ScriptEngineManager manager = new ScriptEngineManager();
            ScriptEngine engine = manager.getEngineByName(language.getEngineName());
            if (engine == null) {
                // The queue rejects null with a bare NullPointerException; fail with a readable message instead.
                throw new IllegalStateException("No script engine found for name: " + language.getEngineName());
            }
            scriptEngines.add(engine);
        }
    }

    /**
     * Borrows an engine, waiting until one is released when the pool is empty.
     * (The previous non-blocking poll returned null in that case.)
     *
     * @return an engine that the caller owns until it is released
     * @throws IllegalStateException if the thread is interrupted while waiting
     */
    public ScriptEngine getEngine() {
        try {
            ScriptEngine engine = scriptEngines.take();
            availableCount.decrementAndGet();
            return engine;
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller's thread.
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Interrupted while waiting for a script engine", e);
        }
    }

    /**
     * Returns a borrowed engine to the pool.
     *
     * @param scriptEngine engine previously obtained from {@link #getEngine()}
     */
    public void release(ScriptEngine scriptEngine) {
        scriptEngines.add(scriptEngine);
        // Mirror the decrement done in getEngine() so the counter stays accurate.
        availableCount.incrementAndGet();
    }
}

@ -0,0 +1,80 @@
package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import javax.script.ScriptContext;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import java.io.IOException;
import java.io.InputStream;
/**
 * {@link PageProcessor} whose page handling is implemented by a script.
 * <p>
 * For every page, the language's helper definitions followed by the user script are
 * evaluated with the page bound as {@code page} and the site bound as {@code config}.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptProcessor implements PageProcessor {

    private ScriptEnginePool enginePool;

    private String defines;

    private String script;

    private final Language language;

    private Site site = Site.me();

    /**
     * @param language  script language, must not be null
     * @param script    script source, must not be null
     * @param threadNum number of script engines to keep in the pool
     * @throws IllegalArgumentException if an argument is null or the define file cannot be read
     */
    public ScriptProcessor(Language language, String script, int threadNum) {
        if (language == null || script == null) {
            throw new IllegalArgumentException("language and script must not be null!");
        }
        this.language = language;
        enginePool = new ScriptEnginePool(language, threadNum);
        defines = readDefines(language.getDefineFile());
        this.script = script;
    }

    /**
     * Reads the helper definitions from the classpath as UTF-8 and closes the stream.
     */
    private String readDefines(String defineFile) {
        InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(defineFile);
        if (resourceAsStream == null) {
            // getResourceAsStream signals a missing resource with null rather than an exception.
            throw new IllegalArgumentException("define file not found on classpath: " + defineFile);
        }
        try {
            // The module's resources are declared UTF-8 in its build, so do not rely on the platform charset.
            return IOUtils.toString(resourceAsStream, "UTF-8");
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        } finally {
            IOUtils.closeQuietly(resourceAsStream);
        }
    }

    /**
     * Evaluates the definitions and the script against {@code page} using a pooled engine.
     * Script errors are printed and otherwise ignored; the engine is always returned to the pool.
     */
    @Override
    public void process(Page page) {
        ScriptEngine engine = enginePool.getEngine();
        try {
            ScriptContext context = engine.getContext();
            context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
            context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
            try {
                engine.eval(defines + "\n" + script, context);
                // Copying a script-side "result" object into page.getResultItems() is not
                // implemented; an earlier per-language draft of it was left disabled here.
            } catch (ScriptException e) {
                e.printStackTrace();
            }
        } finally {
            enginePool.release(engine);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }
}

@ -0,0 +1,71 @@
package us.codecraft.webmagic.scripts;
import org.apache.commons.io.IOUtils;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Fluent builder for {@link ScriptProcessor}.
 * <p>
 * Defaults: JavaScript as language and a thread number of 1. The script has no default
 * and must be supplied through one of the {@code script*} methods.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptProcessorBuilder {

    private static final Language DefaultLanguage = Language.JavaScript;

    private Language language = DefaultLanguage;

    private String script;

    private int threadNum = 1;

    private ScriptProcessorBuilder() {
    }

    /**
     * @return a new builder holding the default settings
     */
    public static ScriptProcessorBuilder custom() {
        return new ScriptProcessorBuilder();
    }

    public ScriptProcessorBuilder language(Language language) {
        this.language = language;
        return this;
    }

    /**
     * Loads the script from a file on the file system.
     *
     * @param fileName path of the script file
     * @return this builder
     * @throws IllegalArgumentException if the file cannot be opened or read
     */
    public ScriptProcessorBuilder scriptFromFile(String fileName) {
        try {
            this.script = readAndClose(new FileInputStream(fileName));
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
        return this;
    }

    /**
     * Loads the script from a classpath resource.
     *
     * @param fileName classpath-relative resource name
     * @return this builder
     * @throws IllegalArgumentException if the resource does not exist or cannot be read
     */
    public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) {
        InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName);
        if (resourceAsStream == null) {
            // A missing resource is reported as null, which previously surfaced as a NullPointerException.
            throw new IllegalArgumentException("script not found on classpath: " + fileName);
        }
        try {
            this.script = readAndClose(resourceAsStream);
        } catch (IOException e) {
            //wrap IOException because I prefer a runtime exception...
            throw new IllegalArgumentException(e);
        }
        return this;
    }

    public ScriptProcessorBuilder script(String script) {
        this.script = script;
        return this;
    }

    public ScriptProcessorBuilder thread(int threadNum) {
        this.threadNum = threadNum;
        return this;
    }

    /**
     * @return a processor created from the current language, script and thread number
     */
    public ScriptProcessor build() {
        return new ScriptProcessor(language, script, threadNum);
    }

    /**
     * Reads the whole stream into a string and closes it, whether or not reading succeeds.
     */
    private static String readAndClose(InputStream in) throws IOException {
        try {
            return IOUtils.toString(in);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }
}

@ -0,0 +1,10 @@
// Applies the selector `str` through the page HTML's `$` method and returns the match as a string.
function $(str){
    var selected = page.getHtml().$(str);
    return selected.toString();
}
// Evaluates the XPath expression `str` against the current page's HTML and returns the match as a string.
function xpath(str){
    var selected = page.getHtml().xpath(str);
    return selected.toString();
}
// Collects the page's links that match the regex `str` and adds them as target requests.
function urls(str){
    // Declared with var so the list stays local instead of becoming an implicit global.
    var links = page.getHtml().links().regex(str).all();
    page.addTargetRequests(links);
}

@ -0,0 +1,14 @@
// Sample script: prints a few fields of a GitHub repository page and queues further GitHub links.
// Relies on the xpath() and urls() helpers and on the `page` binding being defined before it runs.
var name = xpath("//h1[@class='entry-title public']/strong/a/text()");
var readme = xpath("//div[@id='readme']/tidyText()");
var star = xpath("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
var fork = xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()");
var url = page.getUrl().toString();
// Only pages where the repository title was found are printed.
if (name != null) {
    println(name);
    println(readme);
    println(star);
    // fork was extracted but never printed; the Ruby version of this sample prints it.
    println(fork);
    println(url);
}
urls("(https://github\\.com/\\w+/\\w+)");
urls("(https://github\\.com/\\w+)");

@ -0,0 +1,11 @@
// Sample script for blog pages: extracts the title and content and queues further blog post links.
// Relies on the $() and urls() helpers being defined before it runs.

// Extracted fields grouped in one object.
var result = {
title: $("div.BlogTitle h1"),
content: $("div.BlogContent")
}
// NOTE(review): ScriptProcessor binds the Site object under the name "config" before evaluating
// the script; this declaration assigns a plain object to that same name. Confirm this is intended.
var config = {
ua: '',
sleepTime : 20
}
// The same two selectors are evaluated again and assigned without var; the two assignments
// are joined by the comma operator.
title = $("div.BlogTitle h1"),
content = $("div.BlogContent")
// Queue links that match the blog post url pattern.
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+")

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- log4j configuration: everything is written to the console through the "stdout" appender. -->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<!-- Console appender; each line shows timestamp, level, logger name, source file:line and message. -->
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<!-- Loggers under org.apache only report errors; additivity is off, so events are not passed on to the root logger as well. -->
<logger name="org.apache" additivity="false">
<level value="error" />
<appender-ref ref="stdout" />
</logger>
<!-- Every other logger reports at info level and above. -->
<root>
<level value="info" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -0,0 +1,11 @@
# Evaluates the XPath expression +str+ against the current page's HTML and returns the match as a string.
def xpath(str)
  html = $page.getHtml()
  html.xpath(str).toString()
end
# Evaluates the CSS selector +str+ against the current page's HTML and returns the match as a string.
def css(str)
  html = $page.getHtml()
  html.css(str).toString()
end
# Collects the page's links that match the regex +str+ and adds them as target requests.
def urls(str)
  matching = $page.getHtml().links().regex(str).all()
  $page.addTargetRequests(matching)
end

@ -0,0 +1,10 @@
# Sample script: prints a few fields of a GitHub repository page and queues further GitHub links.
# Relies on the xpath and urls helpers and on $page being set before it runs.
name = xpath("//h1[@class='entry-title public']/strong/a/text()")
readme = xpath("//div[@id='readme']/tidyText()")
star = xpath("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()")
fork = xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()")
url = $page.getUrl().toString()
# Only pages where the repository title was found are printed.
unless name.nil?
  puts name, readme, star, fork, url
end
urls("(https://github\\.com/\\w+/\\w+)")
urls("(https://github\\.com/\\w+)")

@ -0,0 +1,3 @@
# Sample script for blog pages: extracts the title and content and queues further blog post links.
title = css("div.BlogTitle h1")
content = css("div.BlogContent")
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+")

@ -0,0 +1,25 @@
package us.codecraft.webmagic.scripts;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
/**
 * Runs the sample blog scripts through a spider, once per supported language.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.1
 */
public class ScriptProcessorTest {

    @Test
    public void testJavaScriptProcessor() {
        crawlBlogWith(Language.JavaScript, "js/oschina.js");
    }

    @Test
    public void testRubyProcessor() {
        crawlBlogWith(Language.JRuby, "ruby/oschina.rb");
    }

    /**
     * Builds a processor from the classpath script and crawls the blog start page
     * with no sleep time and url spawning switched off.
     */
    private static void crawlBlogWith(Language language, String scriptResource) {
        ScriptProcessorBuilder builder = ScriptProcessorBuilder.custom();
        builder.language(language);
        builder.scriptFromClassPathFile(scriptResource);
        ScriptProcessor pageProcessor = builder.build();
        pageProcessor.getSite().setSleepTime(0);
        Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
    }
}

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- log4j configuration: everything is written to the console through the "stdout" appender. -->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<!-- Console appender; each line shows timestamp, level, logger name, source file:line and message. -->
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<!-- Loggers under org.apache report warnings and above; additivity is off, so events are not passed on to the root logger as well. -->
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<!-- Every other logger reports at debug level and above. -->
<root>
<level value="debug" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.0</version>
<version>0.4.1</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Loading…
Cancel
Save