merge change from master

pull/100/merge
Sun Junyi 12 years ago
commit 6549deabbd

1
.gitignore vendored

@ -162,4 +162,5 @@ pip-log.txt
# Mac crap # Mac crap
.DS_Store .DS_Store
*.log *.log
test/tmp/*

@ -1,7 +1,9 @@
from __future__ import with_statement from __future__ import with_statement
import re import re
import math import math
import os,sys import os
import sys
import pprint import pprint
from . import finalseg from . import finalseg
import time import time
@ -103,16 +105,18 @@ def initialize(*args):
def require_initialized(fn):
    """Decorate ``fn`` so that ``initialize(DICTIONARY)`` runs first if needed.

    Each call of the returned wrapper checks the module-level ``initialized``
    flag. When it is falsy, ``initialize(DICTIONARY)`` is called before the
    call is forwarded to ``fn`` with the original arguments.
    NOTE(review): presumably ``initialize`` sets ``initialized``; that is not
    visible from this block, so confirm against its definition.
    """
    @wraps(fn)
    def wrapped(*args, **kwargs):
        # Guard clause: both paths end in the same call to fn, so only the
        # on-demand initialization is conditional.
        if not initialized:
            initialize(DICTIONARY)
        return fn(*args, **kwargs)
    return wrapped
def __cut_all(sentence): def __cut_all(sentence):
dag = get_DAG(sentence) dag = get_DAG(sentence)
@ -212,7 +216,8 @@ def cut(sentence,cut_all=False):
except UnicodeDecodeError: except UnicodeDecodeError:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
if cut_all: if cut_all:
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]") re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
@ -223,9 +228,9 @@ def cut(sentence,cut_all=False):
cut_block = __cut_all cut_block = __cut_all
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
#pprint.pprint(__cut_DAG(blk)) #pprint.pprint(__cut_DAG(blk))
for word in cut_block(blk): for word in cut_block(blk):
yield word yield word
else: else:
tmp = re_skip.split(blk) tmp = re_skip.split(blk)
for x in tmp: for x in tmp:

@ -1,6 +1,11 @@
import jieba import jieba
import os import os
try:
from analyzer import ChineseAnalyzer
except ImportError:
pass
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
f_name = os.path.join(_curpath,"idf.txt") f_name = os.path.join(_curpath,"idf.txt")
content = open(f_name,'rb').read().decode('utf-8') content = open(f_name,'rb').read().decode('utf-8')

@ -59,7 +59,7 @@ def cut(sentence):
except: except:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)") re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:

@ -135,7 +135,7 @@ def __cut_internal(sentence):
except: except:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)") re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)

@ -96,4 +96,4 @@ if __name__ == "__main__":
cuttest('枪杆子中出政权') cuttest('枪杆子中出政权')
cuttest('张三风同学走上了不归路') cuttest('张三风同学走上了不归路')
cuttest('阿Q腰间挂着BB机手里拿着大哥大我一般吃饭不AA制的。') cuttest('阿Q腰间挂着BB机手里拿着大哥大我一般吃饭不AA制的。')
cuttest('在1号店能买到小S和大S八卦的书') cuttest('在1号店能买到小S和大S八卦的书还有3D电视')

@ -1,5 +1,5 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
import sys import sys,os
sys.path.append("../") sys.path.append("../")
from whoosh.index import create_in,open_dir from whoosh.index import create_in,open_dir
from whoosh.fields import * from whoosh.fields import *
@ -10,6 +10,9 @@ from jieba.analyse.analyzer import ChineseAnalyzer
analyzer = ChineseAnalyzer() analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer)) schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
os.mkdir("tmp")
ix = create_in("tmp", schema) # for create new index ix = create_in("tmp", schema) # for create new index
#ix = open_dir("tmp", schema=schema) # for read only #ix = open_dir("tmp", schema=schema) # for read only
writer = ix.writer() writer = ix.writer()

@ -1,5 +1,6 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
import sys import sys
import os
sys.path.append("../") sys.path.append("../")
from whoosh.index import create_in from whoosh.index import create_in
from whoosh.fields import * from whoosh.fields import *
@ -10,6 +11,8 @@ from jieba.analyse import ChineseAnalyzer
analyzer = ChineseAnalyzer() analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer)) schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
os.mkdir("tmp")
ix = create_in("tmp", schema) ix = create_in("tmp", schema)
writer = ix.writer() writer = ix.writer()

@ -0,0 +1,27 @@
# -*- coding: UTF-8 -*-
"""Query a Whoosh index stored in ./tmp and print the highlighted hits.

The script only reads: it opens the index in ``tmp``, parses each sample
keyword against the ``content`` field and prints the highlighted fragments
of every hit. The index must already exist in ``tmp`` when this runs;
otherwise the script reports that and exits with status 1.
"""
import sys
import os
sys.path.append("../")
from whoosh.index import create_in,open_dir,exists_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

INDEX_DIR = "tmp"

analyzer = ChineseAnalyzer()
# Not referenced below: the query parser uses the schema stored with the
# index (ix.schema), not this one.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))

# open_dir() cannot open a directory that holds no index, so creating an
# empty directory here would only replace one failure with another.
# Check for a real index and fail with an explicit message instead.
if not exists_in(INDEX_DIR):
    sys.stderr.write("no Whoosh index found in %r; build one there first\n" % INDEX_DIR)
    sys.exit(1)

ix = open_dir(INDEX_DIR)
searcher = ix.searcher()
try:
    parser = QueryParser("content", schema=ix.schema)
    for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"):
        # Single-argument print() behaves the same on Python 2 and 3; the two
        # spaces reproduce the separator the print statement inserted.
        print("result of  " + keyword)
        q = parser.parse(keyword)
        results = searcher.search(q)
        for hit in results:
            print(hit.highlights("content"))
        print("="*10)
finally:
    # Release the searcher's resources even if a query fails.
    searcher.close()
Loading…
Cancel
Save