【Code Sharing】Series: Naive Bayes (github clone)
Preface
Naive Bayes is a classification algorithm built on probability theory. "Naive" refers to the assumption that all features are conditionally independent; "Bayes" refers to the use of Bayes' theorem.
By Bayes' theorem, for a classification problem with sample features x, the probability that the sample belongs to class y is:

p(y|x) = p(x|y) p(y) / p(x)
Here x is a feature vector; let its dimension be M. Under the naive assumption that features are conditionally independent, the likelihood factorizes and the expression becomes:

p(y|x) = p(y) ∏_{i=1..M} p(x_i|y) / p(x)
We only need to estimate the conditional probability of each feature x_i within each class. The prior probability of class y can be computed from the training set, and the same training-set counts give the vector of conditional probabilities for each class's (conditionally independent) features. Content adapted from Machine Learning in Action (《機器學習實戰》).
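As a quick numeric illustration, here is a minimal Python sketch of the decision rule; the priors and per-feature conditional probabilities below are invented for illustration only:

prior = {0: 0.5, 1: 0.5}                 # assumed class priors p(y)
cond = {0: [0.8, 0.1, 0.3],              # assumed p(x_i = 1 | y = 0)
        1: [0.2, 0.7, 0.6]}              # assumed p(x_i = 1 | y = 1)
x = [1, 0, 1]                            # an observed binary feature vector

def posterior_score(y):
    # proportional to p(y|x); the shared denominator p(x) cancels
    score = prior[y]
    for xi, p in zip(x, cond[y]):
        score *= p if xi == 1 else (1.0 - p)
    return score

print(max((0, 1), key=posterior_score))  # prints 0 for these numbers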
Pros: effective even with relatively little data; can handle multi-class problems.
Cons: sensitive to how the input data is prepared.
Applicable data types: nominal data.
Adapted from: https://www.cnblogs.com/hemiy/p/6194710.html
For the theory in detail, see the earlier articles:
Machine Learning (10): Understanding Naive Bayes through a Fun Case Study
Machine Learning (6): Naive Bayes (NB) with Examples
A One-Article Primer on Bayesian Machine Learning
Machine Learning -- Naive Bayes
Machine Learning (3): Bayes and Regularization
......
More articles via the in-site search link:
http://urlort.cn/4yybf9
Take spam filtering as an example. To extract features from text, the text must first be split into tokens. A token can simply be a word, but non-word tokens such as URLs, IP addresses, or arbitrary other strings also work. Each piece of text is then represented as a token vector, in which a value of 1 means the token appears in the document and 0 means it does not.
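For instance, a minimal sketch with an invented vocabulary and document (the helper functions in the code below do this job for real data):

vocab = ["dog", "spam", "help", "stupid"]      # toy vocabulary (assumed)
doc = ["help", "my", "dog", "help"]            # toy document (assumed)
vec = [1 if w in doc else 0 for w in vocab]    # set-of-words indicator vector
print(vec)                                     # [1, 0, 1, 0]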
Source code: https://github.com/Wellat/MLaction/blob/master/Ch04_NaiveBayes/bayes.py
Code logic:
1. Prepare the data: build word vectors from the text;
2. Train the algorithm: compute probabilities from the word vectors, i.e. the conditional probability of each word given each class. The book's pseudocode for this step is roughly:
count the number of documents in each class
for every training document:
    for each class:
        if a token appears in the document: increment the count for that token
        increment the total token count
for each class:
    for each token:
        divide the token count by the total token count to get the conditional probability
return the conditional probabilities for each class
3. Run the tests.
# -*- coding: utf-8 -*-
"""
Naive Bayes examples from Machine Learning in Action, Chapter 4:
document classification and spam filtering.
"""
from numpy import *
def loadDataSet():
"""
    postingList: the collection of documents, already split into tokens
    classVec: the class label of each document
"""
postingList=[["my", "dog", "has", "flea", "problems", "help", "please"],
["maybe", "not", "take", "him", "to", "dog", "park", "stupid"],
["my", "dalmation", "is", "so", "cute", "I", "love", "him"],
["stop", "posting", "stupid", "worthless", "garbage"],
["mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"],
["quit", "buying", "worthless", "dog", "food", "stupid"]]
    classVec = [0,1,0,1,0,1]    # 1 = abusive text, 0 = normal speech
return postingList,classVec
def createVocabList(dataSet):
    vocabSet = set([])  # use a set to build a duplicate-free vocabulary
for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
return list(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)  # create an all-zeros vector
    # For every word in the document: if it appears in the vocabulary,
    # set the corresponding entry of the output vector to 1
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] = 1
else: print("the word: %s is not in my Vocabulary!" % word)
return returnVec
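# Usage sketch (toy data, not part of the original script):
#   setOfWords2Vec(["dog", "cat"], ["dog", "dog", "bird"])
# returns [1, 0] and prints a warning that "bird" is not in the vocabulary.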
"""
我們將每個詞的出現與否作為一個特徵,這可以被描述為詞集模型(set-of-words model)。
如果一個詞在文檔中出現不止一次,這可能意味著包含該詞是否出現在文檔中所不能表達的某種信息,
這種方法被稱為詞袋模型(bag-of-words model)。
在詞袋中,每個單詞可以出現多次,而在詞集中,每個詞只能出現一次。
為適應詞袋模型,需要對函數setOfWords2Vec稍加修改,修改後的函數稱為bagOfWords2VecMN
"""
def bagOfWords2VecMN(vocabList, inputSet):
returnVec = [0]*len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] += 1
return returnVec
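# Usage sketch (toy data): bagOfWords2VecMN(["dog", "cat"], ["dog", "dog", "cat"])
# returns [2, 1] -- per-word counts rather than 0/1 indicators.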
def trainNB0(trainMatrix,trainCategory):
"""
    Naive Bayes classifier training function (handles only the two-class case)
    trainMatrix: matrix of document word vectors
    trainCategory: class label of each document
"""
numTrainDocs = len(trainMatrix)
numWords = len(trainMatrix[0])
pAbusive = sum(trainCategory)/float(numTrainDocs)
    # Initialize each word count to 1 and each denominator to 2.0
    # (Laplace smoothing) so that no conditional probability is ever zero
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
for i in range(numTrainDocs):
if trainCategory[i] == 1:
p1Num += trainMatrix[i]
p1Denom += sum(trainMatrix[i])
else:
p0Num += trainMatrix[i]
p0Denom += sum(trainMatrix[i])
    # Take the natural log to avoid underflow when many small
    # probabilities would otherwise be multiplied together
    p1Vect = log(p1Num/p1Denom)
    p0Vect = log(p0Num/p0Denom)
return p0Vect,p1Vect,pAbusive
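# Note: trainNB0 returns log conditional probabilities, so downstream code
# adds them rather than multiplying raw probabilities:
#   log(p(c) * prod_i p(w_i|c)) = log p(c) + sum_i log p(w_i|c)
# which is exactly the sum computed in classifyNB below.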
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
"""
    Classification function
    vec2Classify: the vector to classify
    p0Vec, p1Vec, pClass1: the three probability terms computed by trainNB0
"""
p1 = sum(vec2Classify * p1Vec) + log(pClass1)
p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
if p1 > p0:
return 1
else:
return 0
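# The comparison above is valid because log is monotonic: comparing
# sums of logs is equivalent to comparing the products p(x|y)p(y),
# and the shared denominator p(x) cancels out of the comparison.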
def testingNB():
listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat=[]
for postinDoc in listOPosts:
trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    # Train the model; note the conversion to numpy arrays here
p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
testEntry = ["love", "my", "dalmation"]
thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
print(testEntry,"classified as: ",classifyNB(thisDoc,p0V,p1V,pAb))
testEntry = ["stupid", "garbage"]
thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
print(testEntry,"classified as: ",classifyNB(thisDoc,p0V,p1V,pAb))
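    # Expected output for this example (as reported in the book):
    #   ['love', 'my', 'dalmation'] classified as: 0
    #   ['stupid', 'garbage'] classified as: 1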
def textParse(bigString):
    """
    Tokenize text: take a string, return a list of lower-case tokens
    longer than two characters.
    """
    import re
    listOfTokens = re.split(r"\W+", bigString)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
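# Usage sketch: textParse("Hello, World! NLP is fun") returns
# ['hello', 'world', 'nlp', 'fun'] -- "is" is dropped by the len > 2 filter.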
def spamTest():
"""
    Spam e-mail test function
"""
docList=[]; classList = []; fullText =[]
for i in range(1,26):
        # read a spam e-mail
wordList = textParse(open("email/spam/%d.txt" % i,"r",encoding= "utf-8").read())
docList.append(wordList)
fullText.extend(wordList)
        # label spam as class 1
classList.append(1)
wordList = textParse(open("email/ham/%d.txt" % i,"r",encoding= "utf-8").read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
    vocabList = createVocabList(docList)  # build the vocabulary
trainingSet = list(range(50))
    testSet=[]
    # randomly hold out 10 documents as the test set
for i in range(10):
randIndex = int(random.uniform(0,len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
trainMat=[]; trainClasses = []
    for docIndex in trainingSet:  # build the training matrix and labels
trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
trainClasses.append(classList[docIndex])
p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
errorCount = 0
    # classify the held-out documents and compute the error rate
for docIndex in testSet:
wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
errorCount += 1
print("classification error",docList[docIndex])
print("the error rate is: ",float(errorCount)/len(testSet))
#return vocabList,fullText
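# Because the 10 test documents are drawn at random, the reported error rate
# varies between runs; averaging the result over repeated calls to spamTest()
# gives a more stable estimate (random hold-out cross-validation).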
def calcMostFreq(vocabList,fullText):
"""
    Return the 30 highest-frequency words as (word, count) pairs
"""
import operator
freqDict = {}
for token in vocabList:
freqDict[token]=fullText.count(token)
sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
return sortedFreq[:30]
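# An equivalent sketch using the standard library (assuming the same inputs):
#   from collections import Counter
#   Counter(fullText).most_common(30)
# yields the same (word, count) pairs, up to the ordering of ties.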
"""
函數localWords()與程序清單中的spamTest()函數幾乎相同,區別在於這裡訪問的是
RSS源而不是文件。然後調用函數calcMostFreq()來獲得排序最高的30個單詞並隨後將它們移除
"""
def localWords(feed1,feed0):
import feedparser
docList=[]; classList = []; fullText =[]
minLen = min(len(feed1["entries"]),len(feed0["entries"]))
for i in range(minLen):
wordList = textParse(feed1["entries"][i]["summary"])
docList.append(wordList)
fullText.extend(wordList)
classList.append(1) #NY is class 1
wordList = textParse(feed0["entries"][i]["summary"])
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocabList = createVocabList(docList)#create vocabulary
top30Words = calcMostFreq(vocabList,fullText) #remove top 30 words
for pairW in top30Words:
if pairW[0] in vocabList: vocabList.remove(pairW[0])
trainingSet = list(range(2*minLen)); testSet=[] #create test set
for i in range(10):
randIndex = int(random.uniform(0,len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
trainMat=[]; trainClasses = []
for docIndex in trainingSet:#train the classifier (get probs) trainNB0
trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
trainClasses.append(classList[docIndex])
p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
errorCount = 0
for docIndex in testSet: #classify the remaining items
wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
errorCount += 1
print("the error rate is: ",float(errorCount)/len(testSet))
return vocabList,p0V,p1V
def getTopWords(ny,sf):
import operator
vocabList,p0V,p1V=localWords(ny,sf)
topNY=[]; topSF=[]
for i in range(len(p0V)):
if p0V[i] > -6.0 : topSF.append((vocabList[i],p0V[i]))
if p1V[i] > -6.0 : topNY.append((vocabList[i],p1V[i]))
sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
for item in sortedSF:
print(item[0])
sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
for item in sortedNY:
print(item[0])
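# The -6.0 cutoff in getTopWords keeps only words whose log conditional
# probability exceeds -6.0, i.e. p(word|class) > e**-6 (roughly 0.25%),
# filtering out very rare words before ranking.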
if __name__ == "__main__":
testingNB()
    # Import the RSS data sources (uncomment to run localWords):
    # import feedparser
# ny=feedparser.parse("http://newyork.craigslist.org/stp/index.rss")
# sf=feedparser.parse("http://sfbay.craigslist.org/stp/index.rss")
# localWords(ny,sf)
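    # To run the spam filter test instead (requires the book's email/spam/*.txt
    # and email/ham/*.txt data files next to this script):
    # spamTest()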