﻿ 数据挖掘系列（8）朴素贝叶斯分类算法原理与实践-一起大数据

数据挖掘系列（8）朴素贝叶斯分类算法原理与实践

`8343 10000 0.8343`

```#!encoding=utf-8
import random
import sys
import math
import collections
import sys
def shuffle():
'''将原来的文本打乱顺序，用于得到训练集和测试集'''
datas = [line.strip() for line in sys.stdin]
random.shuffle(datas)
for line in datas:
print line

# Class labels; documents on stdin are prefixed with one of these characters.
# ('lable' [sic] kept throughout the file for compatibility.)
lables = ['A','B','C','D','E','F','G','H','I']

def lable2id(lable):
    '''Return the integer index of class label `lable` in `lables`.

    Raises Exception if the label is unknown.'''
    for i, known in enumerate(lables):
        if lable == known:
            return i
    raise Exception('Error lable %s' % (lable))

def docdict():
    '''Return a fresh per-class counter (one zero per label).

    Used as the default_factory of collections.defaultdict so every new
    word starts with a zero count for each class.'''
    return [0]*len(lables)

def mutalInfo(N, Nij, Ni_, N_j):
    '''One term of the mutual-information sum, in bits.

    N    -- total number of documents
    Nij  -- joint count for this (word-presence, class) cell
    Ni_  -- marginal count for the word-presence value
    N_j  -- marginal count for the class value

    The (Nij+1) inside the log is add-one smoothing so the log argument is
    never 0; when Nij == 0 the whole term is 0 (the leading factor wins).
    Dividing by math.log(2) converts the natural log to log base 2.'''
    return Nij * 1.0 / N * math.log(N * (Nij+1)*1.0/(Ni_*N_j)) / math.log(2)

def countForMI():
'''基于统计每个词在每个类别出现的次数，以及每类的文档数'''
docCount = [0] * len(lables)#每个类的词数目
wordCount = collections.defaultdict(docdict)
for line in sys.stdin:
lable,text = line.strip().split(' ',1)
index = lable2id(lable[0])
words = text.split(' ')
for word in words:
wordCount[word][index] += 1
docCount[index] += 1

miDict = collections.defaultdict(docdict)#互信息值
N = sum(docCount)
for k,vs in wordCount.items():
for i in xrange(len(vs)):
N11 = vs[i]
N10 = sum(vs) - N11
N01 = docCount[i] - N11
N00 = N - N11 - N10 - N01
mi = mutalInfo(N,N11,N10+N11,N01+N11) + mutalInfo(N,N10,N10+N11,N00+N10)+ mutalInfo(N,N01,N01+N11,N01+N00)+ mutalInfo(N,N00,N00+N10,N00+N01)
miDict[k][i] = mi
fWords = set()
for i in xrange(len(docCount)):
keyf = lambda x:x[1][i]
sortedDict = sorted(miDict.items(),key=keyf,reverse=True)
for j in xrange(100):
print docCount#打印各个类的文档数目
for fword in fWords:
print fword

def loadFeatureWords():
    '''Load the feature file written by countForMI().

    NOTE(review): the `def` line and the loop body were lost in the source;
    reconstructed from the return statement and countForMI()'s output format
    (first line = printed per-class document counts, then one word per line).
    Returns (docCounts, features).'''
    f = open('feature.txt')
    # eval() parses the printed Python list; acceptable only because
    # feature.txt is a trusted, locally generated file.
    docCounts = eval(f.readline())
    features = set()
    for line in f:
        features.add(line.strip())
    f.close()
    return docCounts, features

def trainBayes():
'''训练贝叶斯模型，实际上计算每个类中特征词的出现次数'''
wordCount = collections.defaultdict(docdict)
tCount = [0]*len(docCounts)#每类文档特征词出现的次数
for line in sys.stdin:
lable,text = line.strip().split(' ',1)
index = lable2id(lable[0])
words = text.split(' ')
for word in words:
if word in features:
tCount[index] += 1
wordCount[word][index] += 1
for k,v in wordCount.items():
scores = [(v[i]+1) * 1.0 / (tCount[i]+len(wordCount)) for i in xrange(len(v))]#加1平滑
print '%s\t%s' % (k,scores)

def loadModel():
    '''Load the Bayes model written by trainBayes().

    NOTE(review): the `def` line was lost in the source; reconstructed from
    the body and the model.txt format ("<word>\\t<list of scores>").
    Returns a dict: word -> per-class probability list.'''
    f = open('model.txt')
    scores = {}
    for line in f:
        word, counts = line.strip().rsplit('\t', 1)
        # eval() parses the printed list; model.txt is trusted local output.
        scores[word] = eval(counts)
    f.close()
    return scores

def predict():
'''预测文档的类标，标准输入每一行为一个文档'''
docscores = [math.log(count * 1.0 /sum(docCounts)) for count in docCounts]
rCount = 0
docCount = 0
for line in sys.stdin:
lable,text = line.strip().split(' ',1)
index = lable2id(lable[0])
words = text.split(' ')
preValues = list(docscores)
for word in words:
if word in features:
for i in xrange(len(preValues)):
preValues[i]+=math.log(scores[word][i])
m = max(preValues)
pIndex = preValues.index(m)
if pIndex == index:
rCount += 1
print lable,lables[pIndex],text
docCount += 1
print rCount,docCount,rCount * 1.0 / docCount

if __name__=="__main__":
#shuffle()
#countForMI()
#trainBayes()
predict()

```

`$ cat train.txt | python bayes.py > feature.txt`

`$ cat train.txt | python bayes.py > model.txt`

`$ cat test.txt | python bayes.py > predict.out`

本文介绍了朴素贝叶斯分类方法，还以文本分类为例，给出了一个具体应用的例子。朴素贝叶斯的朴素体现在条件变量之间的独立性假设，应用到文本分类上，作了两个假设：一是各个特征词对分类的影响是独立的，另一个是词项在文档中的顺序是无关紧要的。朴素贝叶斯的独立性假设在实际中并不成立，但在分类效果上依然不错。加上独立性假设后，对于属于类 ck 的某篇文档 d，其 p(ck|d) 往往会估计过高，即本来预期 p(ck|d)=0.55，而朴素贝叶斯却计算得到 p(ck|d)=0.99，但这并不影响分类结果（取 arg-max 的类别不变），这是朴素贝叶斯分类器在文本分类上效果优于预期的原因。

王斌 译.信息检索导论. 人民邮电出版社

codemeals. 文本特征选择. cnblogs.