Naive Bayesian文本分类器

最后更新于:2022-04-01 20:31:35

贝叶斯学习方法中实用性很高的一种为朴素贝叶斯学习器,常被称为朴素贝叶斯分类器。在某些领域中,其性能与神经网络和决策树学习相当。虽然朴素贝叶斯分类器忽略单词间的依赖关系,即假设所有单词是条件独立的,但朴素贝叶斯分类器在实际应用中有很出色的表现。 朴素贝叶斯文本分类算法伪代码: ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6f3e70e.jpg) 朴素贝叶斯文本分类算法流程: ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6f5f2c7.jpg) 该算法先计算训练集中每个类别的概率与不同类别下每个单词的概率,然后利用朴素贝叶斯公式计算新文档属于各个类别的概率,最终输出概率最大的类别。 C++源码: ~~~ /* Bayesian classifier for document classifiaction 15S103182 Ethan 2015.12.27 */ #include #include #include #include #include #include #include using namespace std; int stringToInteger(string a){ stringstream ss; ss<>b; return b; } vector openClassificationFile(const char* dataset){ fstream file; file.open(dataset,ios::in); if(!file) { cout <<"Open File Failed!" < a; return a; } vector data; int i=1; while(!file.eof()){ string temp; file>>temp; data.push_back(stringToInteger(temp)); } file.close(); return data; } vector openFile(const char* dataset){ fstream file; file.open(dataset,ios::in); if(!file) { cout <<"Open File Failed!" 
< a; return a; } vector data; int i=1; while(!file.eof()){ string temp; file>>temp; data.push_back(temp); } file.close(); for(int i=0;i > openFiles(const vector files){ vector > docs; for(int i=0;i t = openFile(files[i]); docs.push_back(t); } return docs; } void bayesian(vector > docs,vector c,vector d){ map wordFrequency;//每个单词出现的个数 map cWordProbability;//类别单词频率 map cTotalFrequency;//类别单词个数 map > cWordlTotalFrequency;//类别下单词个数 int totalWords=0; for(int i=0;i sn; for(int j=0;j::iterator isn; for(isn = sn.begin();isn!=sn.end();isn++){ cWordlTotalFrequency[c[i]][isn->first] = cWordlTotalFrequency[c[i]][isn->first] + isn->second; } } int tw = wordFrequency.size(); map::iterator icWordProbability; for(icWordProbability=cWordProbability.begin();icWordProbability!=cWordProbability.end();icWordProbability++){ cTotalFrequency[icWordProbability->first] = icWordProbability->second; cWordProbability[icWordProbability->first] = icWordProbability->second / totalWords; } cout<<"Word Frequency:"<::iterator iwordFrequency; for(iwordFrequency=wordFrequency.begin();iwordFrequency!=wordFrequency.end();iwordFrequency++){ cout<first<<"\tFrequency:"<second< dtw;//待分类文档词频 for(int i=0;i > cp;//单词类别概率 map::iterator idtw; for(idtw=dtw.begin();idtw!=dtw.end();idtw++){ map cf; for(int j=0;jfirst] +1) / (cTotalFrequency[j] + wordFrequency.size()); cf[j] = p; cout<<"P("<first<<"|"< > docs; vector c = openClassificationFile("classification.txt"); vector files; files.push_back("1.txt");files.push_back("2.txt");files.push_back("3.txt");files.push_back("4.txt");files.push_back("5.txt"); cout<<"训练文档集:"< d; cout<<"待分类文档:"< ';