merge change from master

pull/100/merge
Sun Junyi 12 years ago
commit 6549deabbd

.gitignore
@@ -162,4 +162,5 @@ pip-log.txt
 # Mac crap
 .DS_Store
 *.log
+test/tmp/*

@@ -1,7 +1,9 @@
 from __future__ import with_statement
 import re
 import math
-import os,sys
+import os
+import sys
+import pprint
 from . import finalseg
 import time
@@ -103,16 +105,18 @@ def initialize(*args):
 def require_initialized(fn):
-    global initialized,DICTIONARY
-    @wraps(fn)
-    def wrapped(*args, **kwargs):
-        if initialized:
-            return fn(*args, **kwargs)
-        else:
-            initialize(DICTIONARY)
-            return fn(*args, **kwargs)
-    return wrapped
+    global initialized,DICTIONARY
+    @wraps(fn)
+    def wrapped(*args, **kwargs):
+        if initialized:
+            return fn(*args, **kwargs)
+        else:
+            initialize(DICTIONARY)
+            return fn(*args, **kwargs)
+    return wrapped

 def __cut_all(sentence):
     dag = get_DAG(sentence)
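
The require_initialized decorator above gives jieba lazy loading: the dictionary is parsed only when the first segmentation call actually needs it, not at import time. A minimal standalone sketch of the same pattern (the globals and loader below are illustrative stand-ins, not jieba's real internals):

    from functools import wraps

    initialized = False
    DICTIONARY = "dict.txt"   # hypothetical default dictionary path

    def initialize(dictionary):
        # stand-in for jieba's real loader, which parses the dictionary file
        global initialized
        print("loading %s ..." % dictionary)
        initialized = True

    def require_initialized(fn):
        @wraps(fn)   # keep fn's name and docstring on the wrapper
        def wrapped(*args, **kwargs):
            if not initialized:
                initialize(DICTIONARY)   # first call pays the loading cost once
            return fn(*args, **kwargs)
        return wrapped

    @require_initialized
    def cut(sentence):
        return sentence.split()
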
@@ -212,7 +216,8 @@ def cut(sentence,cut_all=False):
         except UnicodeDecodeError:
             sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
     if cut_all:
         re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
@@ -223,9 +228,9 @@ def cut(sentence,cut_all=False):
         cut_block = __cut_all
     for blk in blocks:
         if re_han.match(blk):
-            #pprint.pprint(__cut_DAG(blk))
-            for word in cut_block(blk):
-                yield word
+            #pprint.pprint(__cut_DAG(blk))
+            for word in cut_block(blk):
+                yield word
         else:
             tmp = re_skip.split(blk)
             for x in tmp:
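
The re_skip change from (\s+) to (\r\n|\s) alters how the non-Han gaps between blocks come back from re.split: the old pattern returned a whole whitespace run as one block, while the new one yields one block per whitespace character, keeping \r\n together as a single token, so newlines survive as their own output units. A quick illustrative comparison:

    import re

    old_skip = re.compile(r"(\s+)")
    new_skip = re.compile(r"(\r\n|\s)")

    blk = "abc \r\n def"
    print(old_skip.split(blk))  # ['abc', ' \r\n ', 'def']               -- whole run is one block
    print(new_skip.split(blk))  # ['abc', ' ', '', '\r\n', '', ' ', 'def'] -- split per character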

@@ -1,6 +1,11 @@
 import jieba
 import os
+try:
+    from analyzer import ChineseAnalyzer
+except ImportError:
+    pass
 _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
 f_name = os.path.join(_curpath,"idf.txt")
 content = open(f_name,'rb').read().decode('utf-8')

@@ -59,7 +59,7 @@ def cut(sentence):
         except:
             sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
     blocks = re_han.split(sentence)
     for blk in blocks:
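
Here the re_skip change from ([\.0-9]+|[a-zA-Z0-9]+) to (\d+\.\d+|[a-zA-Z0-9]+) tightens the numeric case: only a proper decimal such as 3.14 is consumed as one token, while bare dots or malformed runs like 1..2 no longer match whole. A rough illustrative check:

    import re

    old_skip = re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
    new_skip = re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")

    for s in ("3.14", "...", "1..2"):
        for name, pat in (("old", old_skip), ("new", new_skip)):
            m = pat.match(s)
            print(name, repr(s), "->", m.group() if m else None)
    # old '3.14' -> '3.14'   new '3.14' -> '3.14'
    # old '...'  -> '...'    new '...'  -> None
    # old '1..2' -> '1..2'   new '1..2' -> '1'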

@@ -135,7 +135,7 @@ def __cut_internal(sentence):
         except:
             sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
     re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
     blocks = re_han.split(sentence)

@@ -96,4 +96,4 @@ if __name__ == "__main__":
     cuttest('枪杆子中出政权')
     cuttest('张三风同学走上了不归路')
     cuttest('阿Q腰间挂着BB机手里拿着大哥大我一般吃饭不AA制的。')
-    cuttest('在1号店能买到小S和大S八卦的书')
+    cuttest('在1号店能买到小S和大S八卦的书还有3D电视')

@@ -1,5 +1,5 @@
 # -*- coding: UTF-8 -*-
-import sys
+import sys,os
 sys.path.append("../")
 from whoosh.index import create_in,open_dir
 from whoosh.fields import *
@@ -10,6 +10,9 @@ from jieba.analyse.analyzer import ChineseAnalyzer
 analyzer = ChineseAnalyzer()
 schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+if not os.path.exists("tmp"):
+    os.mkdir("tmp")
 ix = create_in("tmp", schema) # for create new index
+#ix = open_dir("tmp", schema=schema) # for read only
 writer = ix.writer()
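
The added os.path.exists/os.mkdir guard matters because whoosh's create_in expects the index directory to exist already; create_in then starts a fresh index in it (clobbering any previous one), whereas the commented-out open_dir reopens an existing index. A small sketch of an idiom that handles both cases (not part of jieba's tests):

    import os
    from whoosh import index

    def get_index(dirname, schema):
        # create the directory on first run, then reuse the index if present
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        if index.exists_in(dirname):
            return index.open_dir(dirname)
        return index.create_in(dirname, schema)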

@@ -1,5 +1,6 @@
 # -*- coding: UTF-8 -*-
 import sys
+import os
 sys.path.append("../")
 from whoosh.index import create_in
 from whoosh.fields import *
@@ -10,6 +11,8 @@ from jieba.analyse import ChineseAnalyzer
 analyzer = ChineseAnalyzer()
 schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+if not os.path.exists("tmp"):
+    os.mkdir("tmp")
 ix = create_in("tmp", schema)
 writer = ix.writer()

@@ -0,0 +1,27 @@
+# -*- coding: UTF-8 -*-
+import sys
+import os
+sys.path.append("../")
+from whoosh.index import create_in,open_dir
+from whoosh.fields import *
+from whoosh.qparser import QueryParser
+from jieba.analyse import ChineseAnalyzer
+analyzer = ChineseAnalyzer()
+schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+if not os.path.exists("tmp"):
+    os.mkdir("tmp")
+ix = open_dir("tmp")
+searcher = ix.searcher()
+parser = QueryParser("content", schema=ix.schema)
+for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"):
+    print "result of ",keyword
+    q = parser.parse(keyword)
+    results = searcher.search(q)
+    for hit in results:
+        print hit.highlights("content")
+    print "="*10