提取共现词矩阵
请问各位大神,怎么用 C++ 提取共现矩阵?我调试了程序,但输出的共现次数不是 0 次就是 1 次。下面是我的程序,用于统计一个词与它前后两个词的共现次数:
#include "co_occurWord.h"
#include <fstream>
#include <iomanip>
using namespace std;
// Field width handed to setw() for every cell written to the result file.
const int WORD_LEN = 10;
// Constructs a processor whose sentence list and co-occurrence table are empty.
CWordProcessor::CWordProcessor()
{
    // Explicitly empty both containers so every run starts from a clean state.
    m_wordMap.clear();
    m_inputWord.clear();
}
// Destructor: the body is empty, no explicit cleanup is performed here.
CWordProcessor::~CWordProcessor()
{
}
void CWordProcessor::ProcessWords()
{
ReadInputFile();
GetCo_OccurWord();
WriteResultToFile();
}
void CWordProcessor::ReadInputFile()
{
ifstream inFile;
inFile.open("input.txt");
if(!inFile.is_open())
{
return;
}
vector<string> fileWords;
set<string> wordsOfNoRepeat;
while(!inFile.eof())
{
string word;
inFile >> word;
if(word == "。")
{
m_inputWord.push_back(fileWords);
wordsOfNoRepeat.clear();
fileWords.clear();
continue;
}
else
{
int tmp = wordsOfNoRepeat.size();
wordsOfNoRepeat.insert(word);
if(wordsOfNoRepeat.size() > tmp)
{
fileWords.push_back(word);
}
}
}
inFile.close();
}
/*
以5个为一个窗口处理 A C B D F E G N M
采用的方法是以5个单词最中间的一个为基准向前推进,举例说明:
首先处理 A C B,然后处理 C和D;
然后处理 B D F,然后处理 D和E;
然后处理 F E G, 然后处理 E 和N
这样循环下去可以处理所有单词之间的共现关系
*/
void CWordProcessor::GetCo_OccurWord()
{
//process every sentence
for(int i = 0; i < m_inputWord.size(); i++)
{
vector<string> sentence = m_inputWord[i];
//process every word
int index = 0;
for(; index < sentence.size() - 1; index++)
{
process2word(sentence[index], sentence[index+1]);
}
//while(index < sentence.size())
//{
// if(index + 3 < sentence.size())
// {
// process2word(sentence[index], sentence[index+1]);
// process2word(sentence[index], sentence[index+2]);
// process2word(sentence[index+1], sentence[index+2]);
// process2word(sentence[index+1], sentence[index+3]);
// index += 2;
// }
// else if(index + 2 < sentence.size())
// {
// process2word(sentence[index], sentence[index+1]);
// process2word(sentence[index], sentence[index+2]);
// process2word(sentence[index+1], sentence[index+2]);
// break;
// }
// else if(index + 1 < sentence.size())
// {
// process2word(sentence[index], sentence[index+1]);
// break;
// }
//}
}
}
// Records one co-occurrence of two words.
// The pair is first put into ascending string order so that (a, b) and
// (b, a) land in the same table cell and are not counted separately.
void CWordProcessor::process2word( string theWord, string nextWord )
{
    // Canonical order: theWord holds the smaller string, nextWord the larger.
    if (theWord > nextWord)
    {
        theWord.swap(nextWord);
    }

    // operator[] creates an empty inner table the first time theWord is seen.
    MyMap &partners = m_wordMap[theWord];

    MyMap::iterator hit = partners.find(nextWord);
    if (hit != partners.end())
    {
        ++hit->second;          // pair seen before: bump its counter
        return;
    }
    partners[nextWord] = 1;     // first time this pair is seen
}
void CWordProcessor::WriteResultToFile()
{
set<string> fileWords;
for(int i = 0; i < m_inputWord.size(); i++)
{
for(int j = 0; j < m_inputWord[i].size(); j++)
{
fileWords.insert(m_inputWord[i][j]);
}
}
vector<string> wholeWords;
set<string>::iterator iter = fileWords.begin();
for(; iter != fileWords.end(); iter++)
{
wholeWords.push_back(*iter);
}
ofstream outFile;
outFile.open("result.txt");
outFile << setw(WORD_LEN) << " ";
for(int i = 0; i < wholeWords.size(); i++)
{
outFile << setw(WORD_LEN) << wholeWords[i];
}
outFile << endl;
map<string, MyMap>::iterator mapIter = m_wordMap.begin();
MyMap::iterator myIter;
for(int i = 0; i < wholeWords.size(); i++)
{
outFile << setw(WORD_LEN) << wholeWords[i];
for(int j = 0; j < wholeWords.size(); j++)
{
if(i == j)
{
outFile << setw(WORD_LEN) << 0;
}
else
{
int t1 = (i < j) ? i : j;
int t2 = (i < j) ? j : i;
string word_1 = wholeWords[t1];
string word_2 = wholeWords[t2];
mapIter = m_wordMap.find(word_1);
if(mapIter == m_wordMap.end())
{
outFile << setw(WORD_LEN) << 0;
break;
}
MyMap &existMap = mapIter->second;
myIter = existMap.find(word_2);
if(myIter == existMap.end())
{
outFile << setw(WORD_LEN) << 0;
}
else
{
outFile << setw(WORD_LEN) << myIter->second;
}
}
}
outFile << endl;
}
outFile.close();
}