Naive Bayesian文本分类器

最后更新于：2022-04-01 20:31:35

贝叶斯学习方法中实用性很高的一种为朴素贝叶斯学习期，常被称为朴素贝叶斯分类器。在某些领域中与神经网络和决策树学习相当。虽然朴素贝叶斯分类器忽略单词间的依赖关系，即假设所有单词是条件独立的，但朴素贝叶斯分类在实际应用中有很出色的表现。朴素贝叶斯文本分类算法伪代码： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6f3e70e.jpg) 朴素贝叶斯文本分类算法流程： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6f5f2c7.jpg) 通过计算训练集中每个类别的概率与不同类别下每个单词的概率，然后利用朴素贝叶斯公式计算新文档被分类为各个类别的概率，最终输出概率最大的类别。 C++源码： ~~~ /* Bayesian classifier for document classifiaction 15S103182 Ethan 2015.12.27 */ #include #include #include #include #include #include #include using namespace std; int stringToInteger(string a){ stringstream ss; ss<>b; return b; } vector openClassificationFile(const char* dataset){ fstream file; file.open(dataset,ios::in); if(!file) { cout <<"Open File Failed!" < a; return a; } vector data; int i=1; while(!file.eof()){ string temp; file>>temp; data.push_back(stringToInteger(temp)); } file.close(); return data; } vector openFile(const char* dataset){ fstream file; file.open(dataset,ios::in); if(!file) { cout <<"Open File Failed!" < a; return a; } vector data; int i=1; while(!file.eof()){ string temp; file>>temp; data.push_back(temp); } file.close(); for(int i=0;i > openFiles(const vector files){ vector > docs; for(int i=0;i t = openFile(files[i]); docs.push_back(t); } return docs; } void bayesian(vector > docs,vector c,vector d){ map wordFrequency;//每个单词出现的个数 map cWordProbability;//类别单词频率 map cTotalFrequency;//类别单词个数 map > cWordlTotalFrequency;//类别下单词个数 int totalWords=0; for(int i=0;i sn; for(int j=0;j::iterator isn; for(isn = sn.begin();isn!=sn.end();isn++){ cWordlTotalFrequency[c[i]][isn->first] = cWordlTotalFrequency[c[i]][isn->first] + isn->second; } } int tw = wordFrequency.size(); map::iterator icWordProbability; for(icWordProbability=cWordProbability.begin();icWordProbability!=cWordProbability.end();icWordProbability++){ cTotalFrequency[icWordProbability->first] = icWordProbability->second; cWordProbability[icWordProbability->first] = icWordProbability->second / totalWords; } cout<<"Word Frequency:"<::iterator iwordFrequency; for(iwordFrequency=wordFrequency.begin();iwordFrequency!=wordFrequency.end();iwordFrequency++){ cout<first<<"\tFrequency:"<second< dtw;//待分类文档词频 for(int i=0;i > cp;//单词类别概率 map::iterator idtw; for(idtw=dtw.begin();idtw!=dtw.end();idtw++){ map cf; for(int j=0;jfirst] +1) / (cTotalFrequency[j] + wordFrequency.size()); cf[j] = p; cout<<"P("<first<<"|"< > docs; vector c = openClassificationFile("classification.txt"); vector files; files.push_back("1.txt");files.push_back("2.txt");files.push_back("3.txt");files.push_back("4.txt");files.push_back("5.txt"); cout<<"训练文档集："< d; cout<<"待分类文档："< ';

Best-First求解八数码问题

最后更新于：2022-04-01 20:31:33

问题描述：将8-魔方的起始状态在有限移动步骤内，转换为目标状态，如下图所示。 ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6f03ac3.jpg) 算法思路：通过优先队列实现相似度比对，当相似度为9，即当前状态与目标状态一致时，返回当前路径。源码： ~~~ /* 8-cube problem-Best-First algorithm 15S103182 Ethan 2015.12.21 */ #include #include #include #include #include using namespace std ; class cube{ public: vector > m; int similarity; int x;// 0::i int y;// 0::j vector path;//up down left right cube(vector > a,int b):m(a),similarity(b){} void pos(){//0-position for(int i=0;i>score; return score; } cube openFile(const char* dataset){ fstream file; file.open(dataset,ios::in); vector > data; while(!file.eof()){ vector tv; char temp[8]; file.getline(temp,6); tv.push_back(temp[0]-48); tv.push_back(temp[2]-48); tv.push_back(temp[4]-48); data.push_back(tv) ; } file.close(); cube res(data,0); return res; } void output(vector > m){ for(int i=0;i move(const cube& v){ vector res; if(v.x>0){ cube a(v); a.m[a.x][a.y] = a.m[a.x-1][a.y]; a.m[a.x-1][a.y] = 0; a.x--; a.path.push_back("down"); res.push_back(a); } if(v.x<2){ cube a(v); a.m[a.x][a.y] = a.m[a.x+1][a.y]; a.m[a.x+1][a.y] = 0; a.x++; a.path.push_back("up"); res.push_back(a); } if(v.y>0){ cube a(v); a.m[a.x][a.y] = a.m[a.x][a.y-1]; a.m[a.x][a.y-1] = 0; a.y--; a.path.push_back("right"); res.push_back(a); } if(v.y<2){ cube a(v); a.m[a.x][a.y] = a.m[a.x][a.y+1]; a.m[a.x][a.y+1] = 0; a.y++; a.path.push_back("left"); res.push_back(a); } return res; } void eightCube(cube orginal,cube target){ priority_queue,less > pq; vector copy; orginal.similarity = calSimilarity(orginal,target); orginal.pos(); pq.push(orginal); copy.push_back(orginal); while(!pq.empty()){//best-first cube v = pq.top(); pq.pop(); vector ms = move(v); for(int i=0;i=0;j--){ if(calSimilarity(ms[i],copy[j])==9){ flag = 0; break; } } if(flag){ if(ms[i].similarity==9){ cout<<"Path:"< "; for(int k=0;k "; } cout<<"target"< ';

爬山法、分支限界法求解哈密顿环问题

最后更新于：2022-04-01 20:31:31

问题描述: （1）哈密顿环问题：输入是一个无向连通图G=(V,E)；如果G中存在哈密顿环则输出该环。（2）最小哈密顿环问题：输入是一个无向连通图G=(V,E)，每个节点都没有到自身的边，每对节点间都有一条非负加权边；输出一个权值代价和最小的哈密顿环。注意：事实上输入图是一个完全图，因此哈密顿环是一定存在的。实现哈密顿环搜索算法（1）哈密顿环问题：(a)实现基于树的基本搜索算法(BFS,DFS) (b)爬山法（2）最小哈密顿环问题：(a)实现求解最小哈密顿环问题的分支界限算法。 1.DFS 算法的主要步骤: ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6e7351c.jpg) 2.BFS 算法的主要步骤: ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6e933e7.jpg) 3.ClimbingHill 算法的主要步骤: ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6eadff0.jpg) 4.BranchBound 算法的主要原理: ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6ec7b59.jpg) 源码： Grap.h ~~~ /* *Tree Search Strategy 15S103182 Ethan *2015.12.1 */ #include #include #include #include #include #include using namespace std; template //E为图中边的权值的数据类型 class Graph { private: int maxVertices; //图中最大顶点数 E **matrix;//邻接矩阵 public: E maxWeight; //代表无穷大的值 Graph(int sz);//创建SZ大小的基于邻接矩阵的图 ~Graph();//析构函数 int NumberOfVertices() { return maxVertices; //返回最大顶点数 } E getWeight(int v1, int v2); //取边（v1,v2)上的权值 int getFirstNeighbor(int v);//取顶点v的第一个邻接顶点 int getNextNeighbor(int v, int w);//取顶点v的邻接顶点W的下一个邻接顶点 int Init(istream &in);//根据用户输入，获得图的邻接矩阵 int RandInitN();//随机初始化图(无向图) int RandInit();//随机初始化图(完全无向图) int output(ostream &out); //输出图的矩阵到文件 int output(); //输出图的矩阵到控制台 }; template Graph::Graph(int sz) { //创建SZ大小的基于邻接矩阵的图 maxWeight = std::numeric_limits::max(); maxVertices = sz; matrix = new E *[sz]; for (int i = 0; i Graph::~Graph() { for (int i = 0; i E Graph::getWeight(int v1, int v2) { //取边（v1,v2)上的权值 if (v1 >= 0 && v1= 0 && v2 int Graph::getFirstNeighbor(int v) { //取顶点v的第一个邻接顶点 if (!(v >= 0 && v int Graph::getNextNeighbor(int v, int w) { //取顶点v的邻接顶点W的下一个邻接顶点 if (!(v >= 0 && v= 0 && w int Graph::Init(istream &fin) { //根据输入文件，获得图的邻接矩阵 int v1, v2; E edge; while (fin >> v1 >> v2 >> edge) { if (v1 >= 0 && v1= 0 && v2 int Graph::RandInitN() { //随机初始化图(无向图,非完全) for (int i = 0; i int Graph::RandInit() { //随机初始化图(无向完全图) for (int i = 0; i int Graph::output(ostream &out) { //输出图的矩阵 for (int i = 0; i int Graph::output() { //输出图的矩阵 for (int i = 0; i #include #include #include #include #include #include #include using namespace std; template struct NODE { int dep; //表示该结点在搜索树的第几层 int *vertices; //该节点包含的各个顶点 E length; //从根到当前结点已经走过的路径长度 NODE(int depth) { dep = depth; vertices = new int[dep]; }; void cpy(int *&des) { for (int i = 0; i int dfs(int start, Graph &myGraph) { //deepFirst 判断图中是否存在哈密顿回路 stack myStack; myStack.push(start); int numVertices = myGraph.NumberOfVertices(); bool *visited = new bool[numVertices]; memset(visited, false, numVertices); int v; int w = -1; while (!myStack.empty()) { //栈不为空 v = myStack.top(); visited[v] = true; if (w == -1) { w = myGraph.getFirstNeighbor(v); } else { w = myGraph.getNextNeighbor(v, w); } for (; w != -1 && visited[w] == true; w = myGraph.getNextNeighbor(v, w)) {} if (w == -1) { //未找到可行的下一个顶点,子节点都在栈中 myStack.pop(); //回溯 w = v; visited[v] = false; } else { //找到可行的下一个顶点 myStack.push(w); //放入栈中 if (myStack.size() == numVertices) { //走过所有的顶点 if (myGraph.getWeight(start, w) == std::numeric_limits::max()) { //判断最后一个顶点有没有回到起点的边 myStack.pop(); visited[w] = false; } else { //成功找到回路 return 1; } } else { w = -1; } } } return 0; } template int ch(int start, Graph &myGraph) { //climbingHill 爬山法判断图中是否存在哈密顿回路 stack myStack; myStack.push(start); int numVertices = myGraph.NumberOfVertices(); bool *visited = new bool[numVertices]; memset(visited, false, numVertices); int v; int w = -1; while (!myStack.empty()) { //栈不为空 v = myStack.top(); visited[v] = true; if (w == -1) { w = myGraph.getFirstNeighbor(v); } else { w = myGraph.getNextNeighbor(v, w); } for (; w != -1 && visited[w] == true; w = myGraph.getNextNeighbor(v, w)) {} int greedy = w;//贪心选择子顶点 for (; w != -1 && visited[w] == false; w = myGraph.getNextNeighbor(v, w)) { if(myGraph.getWeight(v, w) < myGraph.getWeight(v, greedy)) { greedy = w; } } w = greedy; if (w == -1) { //未找到可行的下一个顶点 myStack.pop(); //回溯 w = v; visited[v] = false; } else { //找到可行的下一个顶点 myStack.push(w); //放入栈中 if (myStack.size() == numVertices) { //走过所有的顶点 if (myGraph.getWeight(start, w) == std::numeric_limits::max()) { //判断最后一个顶点有没有回到起点的边 myStack.pop(); visited[w] = false; } else { //成功找到回路 return 1; } } else { w = -1; } } } return 0; } template int climbingHill(int start, Graph &myGraph, ostream & fout) { //算法解决图的最小哈密顿回路问题 v为回路的起点和终点 stack myStack; myStack.push(start); int numVertices = myGraph.NumberOfVertices(); bool *visited = new bool[numVertices]; memset(visited, false, numVertices); int v; int w = -1; while (!myStack.empty()) { //栈不为空 v = myStack.top(); visited[v] = true; if (w == -1) { w = myGraph.getFirstNeighbor(v); } else { w = myGraph.getNextNeighbor(v, w); } for (; w != -1 && visited[w] == true; w = myGraph.getNextNeighbor(v, w)) {} int greedy = w;//贪心选择子顶点 for (; w != -1 && visited[w] == false; w = myGraph.getNextNeighbor(v, w)) { if(myGraph.getWeight(v, w) < myGraph.getWeight(v, greedy)) { greedy = w; } } w = greedy; if (w == -1) { //未找到可行的下一个顶点 myStack.pop(); //回溯 w = v; visited[v] = false; } else { //找到可行的下一个顶点 myStack.push(w); //放入栈中 if (myStack.size() == numVertices) { //走过所有的顶点 if (myGraph.getWeight(start, w) == std::numeric_limits::max()) { //判断最后一个顶点有没有回到起点的边 myStack.pop(); visited[w] = false; } else { //成功找到回路 stack temp; while (!myStack.empty()) { int n = myStack.top(); temp.push(n); myStack.pop(); } fout << "哈密顿回路 : "; E distance = 0; int n = temp.top(); myStack.push(n); temp.pop(); int last = n; fout << n << "--"; while (!temp.empty()) { n = temp.top(); myStack.push(n); temp.pop(); distance += myGraph.getWeight(last, n); last = n; fout << n << "--"; } fout << start << endl; distance += myGraph.getWeight(last, start); fout << "总长度为:" << distance << endl; return distance; } } else { w = -1; } } } return std::numeric_limits::max(); } template int bfs(int start, Graph & myGraph) { //broadFirst 判断图中是否存在哈密顿回路 stack > myStack; //队列 int s = myGraph.getFirstNeighbor(start); for (s = myGraph.getNextNeighbor(start, s); s != -1; s = myGraph.getNextNeighbor(start, s)) { NODE n(2); n.vertices[0] = start; n.vertices[1] = s; n.length = myGraph.getWeight(start, s); myStack.push(n); } while (!myStack.empty()) { //队列不为空 NODE n = myStack.top(); myStack.pop(); int v = n.vertices[n.dep - 1]; if (n.dep + 1 == myGraph.NumberOfVertices()) { //到了最后一层判断是不是哈密顿回路 int w; for (w = myGraph.getFirstNeighbor(v); w != -1; w = myGraph.getNextNeighbor(v, w)) { if (n.find(w) == false) break; } if (w != -1) { if (myGraph.getWeight(w, start)::max()) { return 1; } } } for (int w = myGraph.getFirstNeighbor(v); w != -1; w = myGraph.getNextNeighbor(v, w)) { if (n.find(w) == false) { NODE ne(n.dep + 1); ne.length = n.length + myGraph.getWeight(v, w); n.cpy(ne.vertices); ne.vertices[ne.dep - 1] = w; myStack.push(ne); } } } return 0; } //分支限界法求解最短哈密顿回路问题 template int branchBound(int start, Graph & myGraph, ostream & fout) { stack > myStack; //队列 E minDistance = climbingHill(start,myGraph,fout);//爬山法获取首界限 int s = start; for (s = myGraph.getFirstNeighbor(start); s != -1; s = myGraph.getNextNeighbor(start, s)) {//首次分支 NODE n(2); n.vertices[0] = start; n.vertices[1] = s; n.length = myGraph.getWeight(start, s); myStack.push(n); } while (!myStack.empty()) { //队列不为空 NODE n = myStack.top(); myStack.pop(); int v = n.vertices[n.dep - 1]; if (n.dep + 1 == myGraph.NumberOfVertices()) { //到了最后一层判断是不是哈密顿回路 int w; for (w = myGraph.getFirstNeighbor(v); w != -1; w = myGraph.getNextNeighbor(v, w)) { if (n.find(w) == false)//确定最后一个节点 break; } if (w != -1) { if (myGraph.getWeight(w, start)::max()) { //如果找到且存在路径 E tempDistance = n.length + myGraph.getWeight(v, w) + myGraph.getWeight(w, start); if (minDistance>tempDistance) { // 形成回路 fout << "哈密顿回路为:"; // cout << "哈密顿回路为:"; for (int i = 0; i ne(n.dep + 1); ne.length = n.length + myGraph.getWeight(v, w); if (ne.length myGraph(i); Graph myGraphN(i); myGraph.RandInit();//随机初始化完全无向图 myGraphN.RandInitN();//随机初始化图 int a = clock(); cout<<"deepFirst:"< myGraphN(4); // ifstream fin("input.txt"); // myGraphN.Init(fin); // cout<<"deepFirst:"< ';

聚类算法-Hierarchical(MIN)-C++

最后更新于：2022-04-01 20:31:28

程序流程图： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6e43044.jpg) Hierarchical(MIN)核心功能函数，采用vector >::dTable存储两点之间的距离。计算每两个point间的距离并保存到distance table中；判断是否达到需要聚类的cluster数量，若是，将point信息写入clustering文件，程序结束。否则，合并两个具有最小距离的point，将两个point中的所有node全部加入到一个point中，删除拷贝node数组后的多余point，跟新dataset数组。然后更新distance table，删除被合并point对应的distance table中行与列，进入下一步循环。 ~~~ /* K-means Algorithm 15S103182 Ethan */ #include #include #include #include #include #include #include #include using namespace std; class node{ public: float x; float y; node(float a,float b){ x = a; y = b; } }; class point{ public: float x; float y; vector nds; point (float a,float b){ x = a; y = b; node t(a,b); nds.push_back(t); } }; float stringToFloat(string i){ stringstream sf; float score=0; sf<>score; return score; } vector openFile(const char* dataset){ fstream file; file.open(dataset,ios::in); if(!file) { cout <<"Open File Failed!" < a; return a; } vector data; while(!file.eof()){ string temp; file>>temp; int split = temp.find(',',0); point p(stringToFloat(temp.substr(0,split)),stringToFloat(temp.substr(split+1,temp.length()-1))); data.push_back(p); } file.close(); cout<<"Read data successfully!"< dataset,int clusters){ //Similarity table：distance table vector > dTable; for(int i=0;i temp; for(int j=0;ji) temp.push_back(squareDistance(dataset[i],dataset[j])); else if(jclusters){ //Merge two closest clusters float minDt =INT_MAX; int mi,mj; for(int i=0;i::iterator imj = dataset.begin(); imj += mj; dataset.erase(imj); //Update the dTable for(int j=0;jdTable[mj][j]){ dTable[mi][j] = dTable[mj][j]; dTable[j][mi] = dTable[mi][j]; } } //rm vector >::iterator ii = dTable.begin(); ii += mj; dTable.erase(ii); for(int i=0;i::iterator ij = dTable[i].begin(); ij += mj; dTable[i].erase(ij); } } fstream clustering; clustering.open("clustering.txt",ios::out); for(int i=0;i dataset = openFile("dataset3.txt"); HierarchicalClustering(dataset,15); return 0; } ~~~ 数据文件格式：（x，y）运行结果格式：（x，y，cluster）具体文件格式见DBSCAN篇：http://blog.csdn.net/k76853/article/details/50440182 图形化展现： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6e5a775.jpg) 总结： Hierarchical(MIN)算法能够很好处理非圆形状的数据。但对于噪音和离群点，Hierarchical(MIN)算法具有局限性。由于层次聚类具有不可恢复性，一旦聚类，就难以撤销聚类操作，刚开始编码的时候走了不少弯路，后来果断决定用数组存储node信息，方便cluster合并。对于近邻表的更新，也遇到一点小麻烦，由于二级指针对于行列的删除比较复杂，果断采用动态数组vector来存储距离信息，二级vector对于行的删除非常直接简单，对于列的删除也只需O(n)操作，比较简洁高效。

';

聚类算法-K-means-C++实现

最后更新于：2022-04-01 20:31:26

程序流程图： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6e09c55.jpg) K-means核心功能函数，首先，随机选择K-中心点（中心点坐标为簇中所有点的x坐标的平均值，y坐标的平均值，该点用于记录位置，不属于原始数据集）；循环判断中心点是否不变，若是，将二维点对信息写入clustering文件，程序结束。否则，对于每个二维数据点，选择与其距离最近的中心点，将点cluster编号更新为中心点的cluster编号。然后对于K-簇，重新计算K-中心点，进入下一个循环判断。计算簇中心是否不变可以采用SSE方式，具体实现代码中已给出，或者直接循环运行多次（不推荐）。 ~~~ /* K-means Algorithm 15S103182 Ethan */ #include #include #include #include #include #include #include #include using namespace std; /* run this program using the console pauser or add your own getch, system("pause") or input loop */ typedef struct Point{ float x; float y; int cluster; Point (){} Point (float a,float b,int c){ x = a; y = b; cluster = c; } }point; float stringToFloat(string i){ stringstream sf; float score=0; sf<>score; return score; } vector openFile(const char* dataset){ fstream file; file.open(dataset,ios::in); vector data; while(!file.eof()){ string temp; file>>temp; int split = temp.find(',',0); point p(stringToFloat(temp.substr(0,split)),stringToFloat(temp.substr(split+1,temp.length()-1)),0); data.push_back(p); } file.close(); return data; } float squareDistance(point a,point b){ return (a.x-b.x)*(a.x-b.x)+(a.y-b.y)*(a.y-b.y); } void k_means(vector dataset,int k){ vector centroid; int n=1; int len = dataset.size(); srand((int)time(0)); //random select centroids while(n<=k){ int cen = (float)rand()/(RAND_MAX+1)*len; point cp(dataset[cen].x,dataset[cen].y,n); centroid.push_back(cp); n++; } for(int i=0;i=1){ // while(time){ oSSE = nSSE; nSSE = 0; //update cluster for all the points for(int i=0;i dataset = openFile("dataset3.txt"); k_means(dataset,7); return 0; } ~~~ 数据文件格式：（x，y）运行结果格式：（x，y，cluster）具体文件格式见DBSCAN篇：http://blog.csdn.net/k76853/article/details/50440182 图形化展现： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6e26247.jpg) 总结： K-means算法运行速度快，实现简便。但K-means算法对具有变化大小，变化密度，非圆形状等特点的数据具有局限性。解决方法是增加K的大小，增加cluster数量，使得数据的特征能够更加明显。对于数据初始中心点的选择，采用随机的方式可能无法产生理想的聚类，这时可以采用二分K-means方法，或层次聚类进行处理。

';

聚类算法-DBSCAN-C++实现

最后更新于：2022-04-01 20:31:24

程序流程图： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187cfa19827.jpg) DBSCAN核心功能函数，计算每个point的eps范围内的point数量pts；对于所有pts >Minpts的point，记为Core point；对于所有的corepoint，将其eps范围内的core point下标添加到vector::corepts中；对于所有的corepoint，采用深度优先的方式遍历core point的所有cluster，使得相互连接的core point具有相同的cluster编号；计算所有pts < Minpts且在Core point范围内的，记为Borderpoint；将所有Borderpoint加入到任意一个关联的core point；剩余的point的为Noise point，文件写写入时忽略；将point信息写入clustering文件，程序结束。 ~~~ /* DBSCAN Algorithm 15S103182 Ethan */ #include #include #include #include #include #include #include #include #include using namespace std; class point{ public: float x; float y; int cluster=0; int pointType=1;//1 noise 2 border 3 core int pts=0;//points in MinPts vector corepts; int visited = 0; point (){} point (float a,float b,int c){ x = a; y = b; cluster = c; } }; float stringToFloat(string i){ stringstream sf; float score=0; sf<>score; return score; } vector openFile(const char* dataset){ fstream file; file.open(dataset,ios::in); if(!file) { cout <<"Open File Failed!" < a; return a; } vector data; int i=1; while(!file.eof()){ string temp; file>>temp; int split = temp.find(',',0); point p(stringToFloat(temp.substr(0,split)),stringToFloat(temp.substr(split+1,temp.length()-1)),i++); data.push_back(p); } file.close(); cout<<"successful!"< dataset,float Eps,int MinPts){ int len = dataset.size(); //calculate pts cout<<"calculate pts"< corePoint; for(int i=0;i=MinPts) { dataset[i].pointType = 3; corePoint.push_back(dataset[i]); } } cout<<"joint core point"< ps; if(corePoint[i].visited == 1) continue; ps.push(&corePoint[i]); point *v; while(!ps.empty()){ v = ps.top(); v->visited = 1; ps.pop(); for(int j=0;jcorepts.size();j++){ if(corePoint[v->corepts[j]].visited==1) continue; corePoint[v->corepts[j]].cluster = corePoint[i].cluster; corePoint[v->corepts[j]].visited = 1; ps.push(&corePoint[v->corepts[j]]); } } } cout<<"border point,joint border point to core point"< dataset = openFile("dataset3.txt"); DBSCAN(dataset,1.5,2); return 0; } ~~~ 数据文件格式：（x，y） ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187cfa2ea61.jpg) 运行结果格式：（x，y，cluster） ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d363a488.jpg) 图形化展现：特殊形状： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d364d035.jpg) 桥梁： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d3671f34.jpg) 变化密度： ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187d6ddaa44.jpg) 总结： DBSCAN算法能够很好处理不同形状与大小的数据，并且抗噪音数据。但对于变化的密度，DBSCAN算法具有局限性。在实现Core point连接时，遇到了一点小小的麻烦，很难将相互连接的core point的cluster编号统一，后来通过给每个core point增加一个数组用于记录相连core point的下标信息，并采用深度优先进行遍历的方式，不仅提高了计算速度，同时也保证了准确性。对于实验数据结果，如果不对其进行图形化展现，很难看出聚类的效果，采用WPF(C#)技术对数据点进行处理，对不同cluster编号的点，赋予不同的颜色，进行图形展现。

';

Apriori算法 (Introduction to data mining)

最后更新于：2022-04-01 20:31:21

前置概念： **Support**: 支持度 s(X->Y) =(XUY)/N; **Confidence**: 置信度 c(X->Y) =(XUY)/(X); **Frequent ItemSet**: 频繁项集 Support >minSup; **Apriori Principle**: 如果一个项集是频繁的，那它所有的子项集也都是频繁的。 **Frequent Itemset Generation in the AprioriAlgorithm:** Apriori算法是第一个指出使用基于支持度剪枝策略的关联规则挖掘算法，系统地控制候选项集的指数增长。 Ck代表k候选项集, Fk代表频繁k项集 1 算法首先遍历一遍数据集，检测每项的支持度，获取频繁1-项集。Steps (1-2) 2 接下来，循环使用频繁(k-1)-项集派生k-候选项集。Step (5) 3 遍历数据集计算候选项集支持度Steps (6-10) 4 计算支持度后，消除非频繁项集Step (12) 5 当没有新的频繁项集产生的时候，算法结束Step(13) **Frequent itemset generation of the AprioriAlgorithm.** ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187cf96b84e.jpg) **Rule generation:** 若果一个规则X->Y-X不满足置信度阀值，那么所有的X’->Y-X’也不满足阀值, 其中X’⊂ X. **Rule generation of the Apriori algorithm.** ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187cf981aac.jpg) **Procedure ap-genrules(fk, Hm).** ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187cf994462.jpg) **总结：** **核心思想: 基于两阶段频繁项集，挖掘关联规则** **算法优点: 简单、易理解、数据要求低** **算法缺点: I/O负载大，产生过多的候选项集** **Apriori例题（Introduction to data mining）：** ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187cf9b90df.jpg) ![](https://docs.gechiui.com/gc-content/uploads/sites/kancloud/2016-04-21_57187cf9e7291.jpg) (b)16/32=50% (c)11/32=34.4% (d)5/32=15.6%

';

顶点覆盖问题

最后更新于：2022-04-01 20:31:19

输入：无向图G(V，E) 输出：C属于V，C中点数最小，满足E中任意一条边中的两个顶点至少有一个在C中 APPROX_Vertex_Cover(G) 1 C=空集 2 E' = E 3 While E' != 空集 Do 4 任选{u,v}属于E' 5 C = C U {u,v} 6 删除E'中所有与u,v相连的边 7 Return C Ratio Bound(近似比) 设A为算法第四步中选中的边集，由算法第六步可知，A中无邻接边第五步中每次增加两个节点到C，因此|C| = 2|A| 设C*是优化解，则C*必须覆盖A 由于A中无邻接边，C*至少包含A中每条边的一个节点，于是|A|<=|C*|,|C| = 2|A| <= 2|C*| 即|C| / |C*| <= 2. 因此，算法近似比为2.

';

判断两多项式之积是否等于另一多项式

最后更新于：2022-04-01 20:31:17

问题描述：判断p(x),q(x)之积是否等于r(x),p,q,r分别为m,n,l 阶多项式 Random_polynomial(p(x),q(x),r(x),m,n,l) 输入：随机选取X[1:k] 输出：p(x)*q(x)是否等于r(x) 1 K =max{m+n,l} 2 For k=1 to K do 3 X[k] = random(real) //no repeat 4 For k=1 to K do 5 If(p(X[k])* q(X[k])!= r(X[k])) then 6 Return false 7 Return true 获得正确解的概率：若p(x)*q(x)与r(x)阶数相同成立，则对任意的k成立，输出正确解；若不成立，除非找到p(x)*q(x)-r(x)=0的k个根，否则等式一定不成立。设实数集合大小为S，则找到k个根的概率为max{m+n,l}/S,因此一定为错误解的概率为max{m+n,l}/S。时间复杂度：O(max{m+n,l}) 综上所述，若算法返回false则一定位正确解；若放回true，则正确解的概率为1-max{m+n,l}/S。由于算法并不能总获得问题的正确解，显然该随机算法为蒙特卡洛算法。

';

随机算法

最后更新于：2022-04-01 20:31:14

### 1. 数值随机算法通过随机点数求解圆周率，定积分等问题。主要利用大数定律与强大数定律。 ### 2. 拉斯维加斯随机算法(Las Vegas) 通过随机选择法求解集合S中的第k小的数算法得到的解一定是正确解，但算法也可能不获得问题的解，满足以上条件即为拉斯维加斯算法。 ### 3. 舍伍德随机算法(Sherwood) 随机快速排序舍伍德算法能够获得问题的正确解，它的精髓在于消除算法的最坏情况与特定实例之间的关联（例如，快速排序最坏时间复杂度为O(n方)） 3. 蒙特卡洛随机算法(Monte Carlo) a算法并不总能获得问题的正确解 b算法以较高的概率获得正确解 c不存在有效过程判定算法输出的解是否为问题的正确解满足以上三条性质的算法称为蒙特卡洛算法。

';

并行机器最短调度问题

最后更新于：2022-04-01 20:31:12

输入:数组t，存储n个任务的执行时间 m台完全一样的机器输出:使任务在m台机器并行执行时间最短的一个调度策略基于贪心选择:选择具有最短任务队列的机器。 ~~~ #include #include using namespace std; int minTask(int *t,int n){ int tmp = t[0]; int min = 0; for(int i=0;it[i]){ tmp = t[i]; min = i; } } return min; } void makeSpanScheduling(int *t,int n,int num){ int *T = new int[num]; vector > M; for(int i=0;i tmp; M.push_back(tmp); } for(int i=0;i ';

求数组中的逆序对

最后更新于：2022-04-01 20:31:10

~~~ #include using namespace std; int count = 0; void merge(int *a,int p,int q,int r){ int n1 = q-p+1; int n2 = r-q; int *m = new int[n1]; int *n = new int[n2]; for(int i=0;i ';

树搜索策略

最后更新于：2022-04-01 20:31:08

## 1. 深度优先维护栈将根节点放入栈循环弹出栈顶元素，然后将栈顶元素的子节点放入栈中当栈顶元素满足约束条件，对其进行处理。当栈空时，搜索结束 ## 2. 广度优先维护队列将根节点放入队列循环弹出队首元素，然后将队首的子节点放入队列中当队首元素满足约束条件，对其进行处理。当队空时，搜索结束 ## 3. 爬山法深度优先+贪心将栈首的子节点，采用贪心策略放入栈顶 ##4. Best First 深度优先+广度优先维护堆将跟节点放入堆将兄弟节点与孩子节点中的最优节点放入堆中若堆首节点满足条件，对其进行处理。堆空，搜索结束 ## 5. 分支限界法剪枝函数+广度优先广度优先跑一次，找出可行解代价对接下来的顶点，判断其代价是否优于可行解，若是，继续，否则孩子节点不予处理 ## 6. A*算法 Best First + 代价函数

';

01背包-近似算法

最后更新于：2022-04-01 20:31:05

~~~ void dp2(int *w, int *v, int n, int c){cout<<"dp2:"<0;i--){ if(m[n][i]<=c&&m[n][i]!=-1){ maxV = i; break; } } for(int i=n;i>0;i--){ if(m[i][maxV]==m[i-1][maxV]) x[i] = 0; else { x[i] = 1; maxV -= v[i-1]; } } // for (int i = 1; i <= n; i++) cout << x[i] << "\t"; cout << endl; } void adp(int *w, int *v, int n, int c,int e){cout<<"adp:"< ';

动态规划算法的一般解题思路

最后更新于：2022-04-01 20:31:03

**1. 证明优化子结构** 对于问题的优化子结构，给出问题具有优化子结构的解代价，利用反证法，假设上解不是最优的，则存在另外一个解，其解优于上解，这与上解是最优的矛盾，于是该问题具有优化子结构。证明优化子结构问题主要利用反证法。 **2. 证明重复子问题** 给出问题的递归公式则重叠子问题鍀证。 **3. 递归的定义最优解的代价** 给出最有解的代价递归公式，利于代码编写。 **4. 自底向上计算最优解的代价** 一般利用二维矩阵求解代价，或一行一行计算代价，或按列计算代价，或按照对角线逐级计算代价。 **5. 构造最优解** 根据最有接的代价矩阵信息，编写函数构造最优解。

';

整数序列合并问题

最后更新于：2022-04-01 20:31:01

~~~ #include using namespace std; void show(int **s,int i,int j){ if(i==j) cout<<"A"<m[i][j]) { m[i][j] = p; s[i][j] = k; } } } } for(int i=0;i ';

01背包

最后更新于：2022-04-01 20:30:59

~~~ void dp(int *w, int *v, int n, int c){ int **m; m = new int*[n+1]; for (int i = 0; i= 2; i--){ for (int j = c; j >= 1; j--){ if (j - w[i - 1] >= 0){ int tmp = m[i + 1][j - w[i - 1]] + v[i - 1]; m[i][j] = m[i + 1][j]>tmp ? m[i + 1][j] : tmp; } else{ m[i][j] = m[i + 1][j]; } } } if (c >= w[0]) m[1][c] = m[2][c - w[0]] + v[0]; else m[1][c] = m[2][c]; // for (int i = 1; i <= n; i++) { // for (int j = 1; j <= c; j++) cout << m[i][j] << "\t"; // cout << endl; // } int *x; x = new int[n+1]; for (int i = 0; i ';

求n*n阶矩阵最大子矩阵阶数

最后更新于：2022-04-01 20:30:56

~~~ int MaxSubMatrixOrder(int **a,int n){ int result[n][n] = {a[0][0]}; for(int i=1;i=result[i-1][j]) result[i][j] = result[i-1][j]; else result[i][j] = result[i][j-1]; if(result[i-1][j-1]>0) result[i][j]++; if(max ';

斐波那契数列-台阶问题

最后更新于：2022-04-01 20:30:54

~~~ int fibonacciSequence(int n){ if(n<1) return 0; if(n<2) return 1; if(n<3) return 2; int a[n+1]; a[0]=1; a[1]=1; for(int i=2;i<=n;i++){ a[i] = a[i-1] + a[i-2]; } return a[n]; } ~~~

';

最长公共子序列

最后更新于：2022-04-01 20:30:52

~~~ int ** lcsLength(int *x,int *y,int m,int n){ int b[m][n],c[m][n]; for(int i=0;ic[i][j-1]){ c[i][j] = c[i-1][j]; b[i][j] = 2; } else { c[i][j] = c[i][j-1]; b[i][j] = -1; } } } int **a = new int*[m]; for(int i=0;i ';