mirror of https://github.com/fxsjy/jieba.git
first commit
commit
6f6e812afb
@ -0,0 +1,22 @@
|
||||
# Auto detect text files and perform LF normalization
|
||||
* text=auto
|
||||
|
||||
# Custom for Visual Studio
|
||||
*.cs diff=csharp
|
||||
*.sln merge=union
|
||||
*.csproj merge=union
|
||||
*.vbproj merge=union
|
||||
*.fsproj merge=union
|
||||
*.dbproj merge=union
|
||||
|
||||
# Standard to msysgit
|
||||
*.doc diff=astextplain
|
||||
*.DOC diff=astextplain
|
||||
*.docx diff=astextplain
|
||||
*.DOCX diff=astextplain
|
||||
*.dot diff=astextplain
|
||||
*.DOT diff=astextplain
|
||||
*.pdf diff=astextplain
|
||||
*.PDF diff=astextplain
|
||||
*.rtf diff=astextplain
|
||||
*.RTF diff=astextplain
|
@ -0,0 +1,163 @@
|
||||
#################
|
||||
## Eclipse
|
||||
#################
|
||||
|
||||
*.pydevproject
|
||||
.project
|
||||
.metadata
|
||||
bin/
|
||||
tmp/
|
||||
*.tmp
|
||||
*.bak
|
||||
*.swp
|
||||
*~.nib
|
||||
local.properties
|
||||
.classpath
|
||||
.settings/
|
||||
.loadpath
|
||||
|
||||
# External tool builders
|
||||
.externalToolBuilders/
|
||||
|
||||
# Locally stored "Eclipse launch configurations"
|
||||
*.launch
|
||||
|
||||
# CDT-specific
|
||||
.cproject
|
||||
|
||||
# PDT-specific
|
||||
.buildpath
|
||||
|
||||
|
||||
#################
|
||||
## Visual Studio
|
||||
#################
|
||||
|
||||
## Ignore Visual Studio temporary files, build results, and
|
||||
## files generated by popular Visual Studio add-ons.
|
||||
|
||||
# User-specific files
|
||||
*.suo
|
||||
*.user
|
||||
*.sln.docstates
|
||||
|
||||
# Build results
|
||||
[Dd]ebug/
|
||||
[Rr]elease/
|
||||
*_i.c
|
||||
*_p.c
|
||||
*.ilk
|
||||
*.meta
|
||||
*.obj
|
||||
*.pch
|
||||
*.pdb
|
||||
*.pgc
|
||||
*.pgd
|
||||
*.rsp
|
||||
*.sbr
|
||||
*.tlb
|
||||
*.tli
|
||||
*.tlh
|
||||
*.tmp
|
||||
*.vspscc
|
||||
.builds
|
||||
*.dotCover
|
||||
|
||||
## TODO: If you have NuGet Package Restore enabled, uncomment this
|
||||
#packages/
|
||||
|
||||
# Visual C++ cache files
|
||||
ipch/
|
||||
*.aps
|
||||
*.ncb
|
||||
*.opensdf
|
||||
*.sdf
|
||||
|
||||
# Visual Studio profiler
|
||||
*.psess
|
||||
*.vsp
|
||||
|
||||
# ReSharper is a .NET coding add-in
|
||||
_ReSharper*
|
||||
|
||||
# Installshield output folder
|
||||
[Ee]xpress
|
||||
|
||||
# DocProject is a documentation generator add-in
|
||||
DocProject/buildhelp/
|
||||
DocProject/Help/*.HxT
|
||||
DocProject/Help/*.HxC
|
||||
DocProject/Help/*.hhc
|
||||
DocProject/Help/*.hhk
|
||||
DocProject/Help/*.hhp
|
||||
DocProject/Help/Html2
|
||||
DocProject/Help/html
|
||||
|
||||
# Click-Once directory
|
||||
publish
|
||||
|
||||
# Others
|
||||
[Bb]in
|
||||
[Oo]bj
|
||||
sql
|
||||
TestResults
|
||||
*.Cache
|
||||
ClientBin
|
||||
stylecop.*
|
||||
~$*
|
||||
*.dbmdl
|
||||
Generated_Code #added for RIA/Silverlight projects
|
||||
|
||||
# Backup & report files from converting an old project file to a newer
|
||||
# Visual Studio version. Backup files are not needed, because we have git ;-)
|
||||
_UpgradeReport_Files/
|
||||
Backup*/
|
||||
UpgradeLog*.XML
|
||||
|
||||
|
||||
|
||||
############
|
||||
## Windows
|
||||
############
|
||||
|
||||
# Windows image file caches
|
||||
Thumbs.db
|
||||
|
||||
# Folder config file
|
||||
Desktop.ini
|
||||
|
||||
|
||||
#############
|
||||
## Python
|
||||
#############
|
||||
|
||||
*.py[co]
|
||||
|
||||
# Packages
|
||||
*.egg
|
||||
*.egg-info
|
||||
dist
|
||||
build
|
||||
eggs
|
||||
parts
|
||||
bin
|
||||
var
|
||||
sdist
|
||||
develop-eggs
|
||||
.installed.cfg
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
.coverage
|
||||
.tox
|
||||
|
||||
#Translations
|
||||
*.mo
|
||||
|
||||
#Mr Developer
|
||||
.mr.developer.cfg
|
||||
|
||||
# Mac crap
|
||||
.DS_Store
|
@ -0,0 +1,60 @@
|
||||
import re
|
||||
import math
|
||||
import os,sys
|
||||
import pprint
|
||||
|
||||
def gen_trie(f_name):
|
||||
trie = {}
|
||||
for line in open(f_name):
|
||||
word,freq = line.strip().split(" ")
|
||||
word = word.decode('utf-8')
|
||||
p = trie
|
||||
for c in word:
|
||||
if not c in p:
|
||||
p[c] ={}
|
||||
p = p[c]
|
||||
p['']='' #ending flag
|
||||
return trie
|
||||
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
print >> sys.stderr, "loading dictionary..."
|
||||
trie = gen_trie(os.path.join(_curpath,"dict.txt"))
|
||||
print >> sys.stderr,"done."
|
||||
|
||||
def __cut(sentence):
|
||||
N = len(sentence)
|
||||
i,j=0,0
|
||||
p = trie
|
||||
while i<N:
|
||||
c = sentence[j]
|
||||
if c in p:
|
||||
p = p[c]
|
||||
if '' in p:
|
||||
yield sentence[i:j+1]
|
||||
j+=1
|
||||
if j>=N:
|
||||
i+=1
|
||||
j=i
|
||||
else:
|
||||
p = trie
|
||||
i+=1
|
||||
j=i
|
||||
|
||||
def cut(sentence):
|
||||
if not ( type(sentence) is unicode):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
||||
blocks = re_han.split(sentence)
|
||||
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
yield x
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,15 @@
|
||||
#encoding=utf-8
|
||||
import sys
|
||||
import jieba
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = jieba.cut(test_sent)
|
||||
for word in result:
|
||||
print word, "/",
|
||||
print ""
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
Loading…
Reference in New Issue