全部課程
發布時間: 2019-09-08 13:28:19
7、使用第5步中訓練好的模型,根據第6步提取的特征向量對郵件進行分類。
2.代碼目錄結構
3.編寫mail_savemodel.py文件
3.1.導入需要用到的標準庫和擴展庫對象
from re import sub
from os import listdir
from collections import Counter
from itertools import chain
from numpy import array
from jieba import cut
from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB
3.2.讀取全部訓練集,刪除干擾字符或長度為1的單詞
# Tokenised training corpus shared by the functions below.
# Each element is a sub-list holding the words of one e-mail file.
allWords = []
def getWordsFromFile(txtFile):
    """Return the list of words found in one UTF-8 e-mail text file.

    Each line is stripped, cleaned of noise characters with a regular
    expression, segmented with jieba's ``cut`` and the resulting tokens are
    collected in file order. Tokens of length 1 are discarded, as announced
    by the section heading ("remove noise characters or words of length 1").

    Args:
        txtFile: Path of the text file to read.

    Returns:
        A list of word strings (possibly empty).
    """
    words = []
    with open(txtFile, encoding="utf8") as fp:
        for line in fp:
            # Drop noise / invalid characters before segmentation.
            # NOTE(review): inside the character class, `、-。` is parsed as
            # the range U+3001-U+3002, so an ASCII hyphen is NOT stripped;
            # confirm whether a literal dash was intended.
            line = sub(r'[.【】0-9、-。,!~\*]', '', line.strip())
            # Single-character tokens carry little signal; keep longer ones.
            words.extend(word for word in cut(line) if len(word) > 1)
    return words
3.3.獲取并返回出現次數最多的前topN個單詞
def getTopNWords(topN):
    """Return the topN most frequent words over the whole training set.

    The training e-mails are read in numeric order from data/0.txt to
    data/150.txt (151 files; per the original notes 0-126 are spam and
    127-150 are normal mail). As a side effect the module-level list
    ``allWords`` is filled with one word list per file, in the same order.

    Args:
        topN: Maximum number of words to return.

    Returns:
        A list of words ordered from most to least frequent.
    """
    txtFiles = ["data/" + str(i) + ".txt" for i in range(151)]
    # Replace the contents in place rather than appending: a second call
    # must not duplicate every document (that would skew the counts and
    # break the one-row-per-e-mail alignment with the labels).
    allWords[:] = [getWordsFromFile(txtFile) for txtFile in txtFiles]
    # Count every word across all e-mails and keep the most common ones.
    freq = Counter(chain.from_iterable(allWords))
    return [word for word, _ in freq.most_common(topN)]
# The 600 most frequent words across the whole training set; these words
# define the columns of the feature vectors.
topWords = getTopNWords(600)
3.4.創建貝葉斯模型,使用已有數據進行訓練
# Build the feature matrix: one row per training e-mail, one column per
# entry of topWords, each cell holding how many times that word occurs in
# that e-mail.
vector = []
for words in allWords:
    # Count the e-mail's words once instead of rescanning the whole word
    # list for every top word; a Counter yields 0 for absent words.
    counts = Counter(words)
    vector.append([counts[word] for word in topWords])
vector = array(vector)
# E-mail labels: 1 = spam, 0 = normal mail (127 spam followed by 24 normal,
# matching the order in which the files were read).
labels = array([1] * 127 + [0] * 24)
# Create the multinomial naive Bayes model and train it on the known set.
model = MultinomialNB()
model.fit(vector, labels)
3.5.保存模型
# Persist the trained classifier so the prediction script can reload it.
# NOTE(review): `sklearn.externals.joblib` is no longer shipped by newer
# scikit-learn releases; confirm the installed version or import `joblib`
# directly.
joblib.dump(model, "垃圾郵件分類器.pkl")
# Store the vocabulary as one comma-separated line; the prediction script
# splits it on "," to rebuild the same column order.
with open("topWords.txt", "w", encoding="utf8") as fp:
    fp.write(",".join(topWords))
print("保存topWords成功.")
4.編寫mail_loadmodel.py文件
4.1.加載模型
def getWordsFromFile(txtFile):
    """Return the list of words found in one UTF-8 e-mail text file.

    Each line is stripped, cleaned of noise characters with a regular
    expression and segmented with jieba's ``cut``; all tokens are collected
    in file order. Tokens are not filtered by length here: ``predict`` only
    counts the words listed in ``topWords``, so extra tokens do not change
    the feature vector.

    Args:
        txtFile: Path of the text file to read.

    Returns:
        A list of word strings (possibly empty).
    """
    words = []
    with open(txtFile, encoding="utf8") as fp:
        for line in fp:
            # Drop noise / invalid characters before segmentation.
            # NOTE(review): inside the character class, `、-。` is parsed as
            # the range U+3001-U+3002, so an ASCII hyphen is NOT stripped;
            # confirm whether a literal dash was intended.
            line = sub(r'[.【】0-9、-。,!~\*]', '', line.strip())
            words.extend(cut(line))
    return words
# Reload the classifier written by the training script.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load("垃圾郵件分類器.pkl")
print('加載模型和訓練結果成功。')
# Rebuild the vocabulary in the same order that was used for training; the
# file holds the words on a single comma-separated line.
with open("topWords.txt", encoding="utf8") as fp:
    topWords = fp.read().split(",")
4.2.使用訓練好的模型對未知郵件內容進行分類。
def predict(txtFile):
    """Classify the e-mail stored in txtFile with the loaded model.

    The file is tokenised with ``getWordsFromFile``, turned into a vector
    holding the number of occurrences of every word of ``topWords`` and
    passed to ``model.predict`` as a single-row matrix.

    Args:
        txtFile: Path of the e-mail text file to classify.

    Returns:
        "垃圾郵件" when the predicted label is 1, otherwise "正常郵件".
    """
    words = getWordsFromFile(txtFile)
    currentVector = array([words.count(word) for word in topWords])
    # The model expects a 2-D input: one row, one column per top word.
    result = model.predict(currentVector.reshape(1, -1))
    # predict returns one label per row; compare the single scalar label
    # rather than relying on the truthiness of a one-element array.
    return "垃圾郵件" if result[0] == 1 else "正常郵件"
# data/151.txt to data/155.txt hold the test e-mails; print "<path>:<label>"
# for each of them.
for mail in ('data/%d.txt' % i for i in range(151, 156)):
    print(mail, predict(mail), sep=":")
?