大家好,又见面了,我是你们的朋友全栈君。如果您正在找激活码,请点击查看最新教程,关注关注公众号 “全栈程序员社区” 获取激活教程,可能之前旧版本教程已经失效.最新Idea2022.1教程亲测有效,一键激活。
Jetbrains全家桶1年46,售后保障稳定
源码如下:
import jieba
import io
import re
#jieba.load_userdict("E:/xinxi2.txt")
# Matches any two consecutive characters. Used below as a cheap length filter:
# findall() returns an empty list for single-character tokens, so only tokens
# of length >= 2 are written out.
patton=re.compile(r'..')
# Load a custom dictionary into jieba.
def add_dict():
    """Read custom words (one per line) and raise their frequency in jieba.

    Fix: the original opened the file without a context manager ("r+" mode,
    never closed on exception). "with" guarantees the handle is released,
    and read-only "r" is sufficient since the file is only read.
    """
    with open("E:/xinxi2.txt", "r", encoding="utf-8") as f:  # dictionary crawled from Baidu
        for line in f:
            jieba.suggest_freq(line.rstrip("\n"), True)
# Segment each sentence into words.
def cut():
    """Tokenize every line of E:/luntan.txt and persist multi-char tokens.

    For each input line: strip stopwords (seg_sentence), re-tokenize with
    jieba, and append every token of length >= 2 (per the `patton` filter)
    to the output file via write(). A newline is appended after each
    non-trivial sentence so the output keeps one sentence per line.

    Fix: the input file is now opened with a context manager (the original
    leaked the handle) and in read-only mode.
    """
    number = 0
    with open("E:/luntan.txt", "r", encoding="utf-8") as f:  # crawled CSDN forum titles
        for line in f:
            line = seg_sentence(line.rstrip("\n"))  # remove stopwords first
            seg_list = jieba.cut(line)
            for i in seg_list:
                print(i)  # token content
                m = patton.findall(i)
                # print(len(m))  # token length check
                if len(m) != 0:  # empty match list => single-char token, skip it
                    write(i.strip() + " ")
            line = line.rstrip().lstrip()
            print(len(line))  # sentence length
            if len(line) > 1:
                write("\n")  # terminate this sentence's token line
            number += 1
            print("已处理", number, "行")
# Write segmented tokens to the output file.
def write(contents):
    """Append *contents* to the segmentation output file.

    Fix: use a context manager so the handle is closed even if f.write
    raises (the original paired open()/close() manually).
    """
    with open("E://luntan_cut2.txt", "a+", encoding="utf-8") as f:  # output file
        f.write(contents)
        # print("写入成功!")
# Build the stopword list.
def stopwordslist(filepath):
    """Read a stopword file (one word per line) and return the stripped words.

    Fix: the original called open() in an expression and never closed the
    handle; `with` releases it deterministically. Iterating the file object
    directly also avoids materializing the whole file via readlines().
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
# Remove stopwords from a sentence.
def seg_sentence(sentence):
    """Tokenize *sentence* with jieba and return it with stopwords removed.

    Fix: the original re-read the stopword file on EVERY call (O(file size)
    per sentence) and tested membership against a list (O(n) per token).
    The stopwords are now loaded once, cached on the function object, and
    stored as a set for O(1) lookups. Tokens are joined instead of being
    accumulated with `+=`.
    """
    stop = getattr(seg_sentence, "_stopwords", None)
    if stop is None:
        stop = set(stopwordslist('E://stop.txt'))  # stopword file path
        seg_sentence._stopwords = stop
    tokens = jieba.cut(sentence.strip())
    return ''.join(w for w in tokens if w not in stop and w != '\t')
# Batch version: strip stopwords from a whole file (currently unused).
def cut_all():
    """Strip stopwords from every line of luntan_cut.txt into luntan_stop.txt.

    Fixes: the output path was 'E//luntan_stop.txt' (missing colon) — on
    Windows that writes into a relative directory named 'E' instead of
    drive E:. The output file also lacked an explicit encoding (the locale
    default, typically GBK on Chinese Windows, can raise UnicodeEncodeError);
    it is now UTF-8 to match the input. Both files use context managers.
    """
    with open('E://luntan_cut.txt', 'r', encoding='utf-8') as inputs, \
         open('E://luntan_stop.txt', 'a', encoding='utf-8') as outputs:
        for line in inputs:
            line_seg = seg_sentence(line)  # returns a plain string
            outputs.write(line_seg + '\n')
if __name__ == "__main__":
    # Load the custom dictionary first so jieba segments with it, then
    # tokenize the forum titles. (cut_all is intentionally not invoked.)
    add_dict()
    cut()
luntan.txt的来源,地址:https://www.cnblogs.com/zlc364624/p/12285055.html
其中停用词表可自行百度下载,或者自己创建一个 txt 文件,自行添加词汇并用换行符隔开(每行一个词)。
百度爬取的字典在前几期博客中可以找到,地址:https://www.cnblogs.com/zlc364624/p/12289008.html
效果如下:
import jieba
import io
import re
#jieba.load_userdict("E:/xinxi2.txt")
# Matches any two consecutive characters. Used below as a cheap length filter:
# findall() returns an empty list for single-character tokens, so only tokens
# of length >= 2 are written out.
patton=re.compile(r'..')
# Load a custom dictionary into jieba.
def add_dict():
    """Read custom words (one per line) and raise their frequency in jieba.

    Fix: the original opened the file without a context manager ("r+" mode,
    never closed on exception). "with" guarantees the handle is released,
    and read-only "r" is sufficient since the file is only read.
    """
    with open("E:/xinxi2.txt", "r", encoding="utf-8") as f:  # dictionary crawled from Baidu
        for line in f:
            jieba.suggest_freq(line.rstrip("\n"), True)
# Segment each sentence into words.
def cut():
    """Tokenize every line of E:/luntan.txt and persist multi-char tokens.

    For each input line: strip stopwords (seg_sentence), re-tokenize with
    jieba, and append every token of length >= 2 (per the `patton` filter)
    to the output file via write(). A newline is appended after each
    non-trivial sentence so the output keeps one sentence per line.

    Fix: the input file is now opened with a context manager (the original
    leaked the handle) and in read-only mode.
    """
    number = 0
    with open("E:/luntan.txt", "r", encoding="utf-8") as f:  # crawled CSDN forum titles
        for line in f:
            line = seg_sentence(line.rstrip("\n"))  # remove stopwords first
            seg_list = jieba.cut(line)
            for i in seg_list:
                print(i)  # token content
                m = patton.findall(i)
                # print(len(m))  # token length check
                if len(m) != 0:  # empty match list => single-char token, skip it
                    write(i.strip() + " ")
            line = line.rstrip().lstrip()
            print(len(line))  # sentence length
            if len(line) > 1:
                write("\n")  # terminate this sentence's token line
            number += 1
            print("已处理", number, "行")
# Write segmented tokens to the output file.
def write(contents):
    """Append *contents* to the segmentation output file.

    Fix: use a context manager so the handle is closed even if f.write
    raises (the original paired open()/close() manually).
    """
    with open("E://luntan_cut2.txt", "a+", encoding="utf-8") as f:  # output file
        f.write(contents)
        # print("写入成功!")
# Build the stopword list.
def stopwordslist(filepath):
    """Read a stopword file (one word per line) and return the stripped words.

    Fix: the original called open() in an expression and never closed the
    handle; `with` releases it deterministically. Iterating the file object
    directly also avoids materializing the whole file via readlines().
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
# Remove stopwords from a sentence.
def seg_sentence(sentence):
    """Tokenize *sentence* with jieba and return it with stopwords removed.

    Fix: the original re-read the stopword file on EVERY call (O(file size)
    per sentence) and tested membership against a list (O(n) per token).
    The stopwords are now loaded once, cached on the function object, and
    stored as a set for O(1) lookups. Tokens are joined instead of being
    accumulated with `+=`.
    """
    stop = getattr(seg_sentence, "_stopwords", None)
    if stop is None:
        stop = set(stopwordslist('E://stop.txt'))  # stopword file path
        seg_sentence._stopwords = stop
    tokens = jieba.cut(sentence.strip())
    return ''.join(w for w in tokens if w not in stop and w != '\t')
# Batch version: strip stopwords from a whole file (currently unused).
def cut_all():
    """Strip stopwords from every line of luntan_cut.txt into luntan_stop.txt.

    Fixes: the output path was 'E//luntan_stop.txt' (missing colon) — on
    Windows that writes into a relative directory named 'E' instead of
    drive E:. The output file also lacked an explicit encoding (the locale
    default, typically GBK on Chinese Windows, can raise UnicodeEncodeError);
    it is now UTF-8 to match the input. Both files use context managers.
    """
    with open('E://luntan_cut.txt', 'r', encoding='utf-8') as inputs, \
         open('E://luntan_stop.txt', 'a', encoding='utf-8') as outputs:
        for line in inputs:
            line_seg = seg_sentence(line)  # returns a plain string
            outputs.write(line_seg + '\n')
if __name__ == "__main__":
    # Load the custom dictionary first so jieba segments with it, then
    # tokenize the forum titles. (cut_all is intentionally not invoked.)
    add_dict()
    cut()
发布者:全栈程序员-用户IM,转载请注明出处:https://javaforall.cn/223073.html原文链接:https://javaforall.cn
【正版授权,激活自己账号】: Jetbrains全家桶Ide使用,1年售后保障,每天仅需1毛
【官方授权 正版激活】: 官方授权 正版激活 支持Jetbrains家族下所有IDE 使用个人JB账号...