本文共 5770 字,大约阅读时间需要 19 分钟。
朴素贝叶斯分类
# coding=utf-8
"""Naive Bayes text classification ("Machine Learning in Action", ch. 4).

Created on 2016-01-09
@author: admin

Ported to runnable Python 3: print functions, re.split(r'\W+') errata fix,
list(range(50)) for in-place deletion, context-managed file reads.
"""
import random
import re

import numpy as np


def loadDataSet():
    """Return a toy corpus of tokenized posts and their labels.

    Returns:
        postingList: list of token lists, one per document.
        classVec: labels, 1 = abusive, 0 = not abusive.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return the list of unique words appearing in any document of dataSet."""
    vocabSet = set()
    for document in dataSet:
        # | is set union: fold every document's words into the vocabulary.
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document to a 0/1 presence vector over vocabList.

    Words absent from the vocabulary are reported but otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # Set-of-words model: only presence matters, not count.
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec


def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document to a count vector over vocabList (bag-of-words)."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # Bag-of-words model: accumulate occurrence counts.
            returnVec[vocabList.index(word)] += 1
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes classifier.

    Args:
        trainMatrix: 2-D array/list of per-document word vectors.
        trainCategory: label vector, 1 = abusive, 0 = not abusive.

    Returns:
        (p0Vect, p1Vect, pAbusive): log word-probability vectors for class 0
        and class 1, and the prior P(class=1). P(class=0) is 1 - pAbusive.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: initialize counts to 1 and denominators to 2 so an
    # unseen word never yields a zero probability (which would zero out the
    # whole product at classification time).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]           # per-word counts, class 1
            p1Denom += sum(trainMatrix[i])    # total words, class 1
        else:
            p0Num += trainMatrix[i]           # per-word counts, class 0
            p0Denom += sum(trainMatrix[i])    # total words, class 0
    # Work in log space to avoid floating-point underflow when many small
    # probabilities are combined (products become sums in classifyNB).
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector; return 1 (abusive) or 0 (not abusive).

    Compares log P(w|c) · count + log P(c) for the two classes.
    """
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Smoke-test the classifier on the toy corpus; prints predictions."""
    listOPosts, listClass = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, doc) for doc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClass))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))


def textParse(bigString):
    """Split a string into lowercase tokens longer than 2 characters."""
    # \W+ (not the book's \W*): split on RUNS of non-word characters.
    # \W* can match the empty string and shreds the text character by
    # character on modern Python (well-known errata of the original code).
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    """Train/test spam detection on email/spam and email/ham (25 files each).

    Holds out 10 random documents as a test set, trains on the remaining 40
    with the bag-of-words model, and prints misclassified docs and the error
    rate. Requires the book's email/ directory next to this script.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # latin-1 tolerates the non-UTF-8 byte present in the book's
        # ham/23.txt sample; 'with' guarantees the files are closed.
        with open('email/spam/%d.txt' % i, encoding='latin-1') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)   # 1 = spam
        with open('email/ham/%d.txt' % i, encoding='latin-1') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)   # 0 = ham
    vocabList = createVocabList(docList)
    # list() is required in Python 3: a bare range() does not support del.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
转载地址:http://twwui.baihongyu.com/