How do I solve this problem? (Java code relay; partial code already written.)

justmewei 2010-03-22 02:52:52
现有英语句子String[] docs = {
"Human machine interface for Lab ABC computer applications ",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user-perceived response time to error measurement",
"The generation of random, binary, unordered trees",
"The intersection graph of paths in trees.",
"Graph minors IV: Widths of trees and well-quasi-ordering",
"Graph minors: A survey "};
First, extract the words from the sentences (each word must not appear in the stop-word list and must occur in at least two sentences), then use the extracted words to build a word-sentence matrix (a two-dimensional array) with a structure like this:

Matrix A =
           sent0  sent1  sent2  sent3  sent4  sent5  sent6  sent7  sent8
human        1      0      0      1      0      0      0      0      0
interface    1      0      1      0      0      0      0      0      0
computer     1      1      0      0      0      0      0      0      0
survey       0      1      0      0      0      0      0      0      1
user         0      1      1      0      1      0      0      0      0
system       0      1      1      2      0      0      0      0      0
response     0      1      0      0      1      0      0      0      0
time         0      1      0      0      1      0      0      0      0
eps          0      0      1      1      0      0      0      0      0
trees        0      0      0      0      0      1      1      1      0
graph        0      0      0      0      0      0      1      1      1
minors       0      0      0      0      0      0      0      1      1

(Rows represent words and columns represent sentences; element aij of the matrix is the weight of the i-th word in the j-th sentence, computed with TF-IDF. In the example above aij is only the raw count of the i-th word in the j-th sentence; what I ultimately want is the matrix expressed with TF-IDF weights.)
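To make the weighting concrete, this is roughly what one cell should be, matching the formulas used in the code below (a rough sketch only; the method name tfIdfWeight and its parameters are just illustrative):

    static double tfIdfWeight(int tf, int df, int numDocs) {
        if (tf == 0 || df == 0) return 0.0;                  // the word does not occur in this sentence
        double idf = Math.log((numDocs + 1) / (double) df);  // smoothed inverse document frequency
        return (1.0 + Math.log(tf)) * idf;                   // sublinear term frequency times idf
    }
    // e.g. "system" in sent3: tf = 2, df = 3 of 9 sentences, so (1 + ln 2) * ln(10 / 3) ≈ 2.04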
Could someone help me out with this? Much appreciated!

Below is the existing code; I would like to modify it to implement the requirements above.


import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Test {

static class Term {

private String term;

private int df; // document frequency

private int tf; // term frequency

private double idf; // inverse document frequency

private double tfIdf; // tf-idf weight of the term

// getter and setter for df
public int getDf() {
return df;
}

public void setDf(int f) {
this.df = f;
}

// getter and setter for term
public String getTerm() {
return term;
}

public void setTerm(String term) {
this.term = term;
}

// getter and setter for idf
public double getIdf() {
return idf;
}

public void setIdf(double idf) {
this.idf = idf;
}

// getter and setter for tf
public int getTf() {
return tf;
}

public void setTf(int tf) {
this.tf = tf;
}

// two Terms are equal when they hold the same word (used by indexOf/contains)
public boolean equals(Object t) {
return this.term.equals(((Term) t).getTerm());
}

// getter and setter for tfIdf
public double getTfIdf() {
return tfIdf;
}

public void setTfIdf(double tfIdf) {
this.tfIdf = tfIdf;
}
}


static class Document {

// terms of this document (stop words excluded)
private ArrayList<Term> terms = new ArrayList<Term>();


// raw text of the document
private String content;

public String getContent() {
return content;
}

public void setContent(String content) {
this.content = content;
}

// check whether the term list already contains t
public boolean findTerm(Term t) {
return terms.indexOf(t) != -1;
}

// count term frequencies in this document
public void computeTerms() {
// lower-case the content and split on whitespace or on the punctuation marks : . ? ! ,
String[] tokens = content.toLowerCase().split("\\s{1,}|(:|\\.|\\?|\\!|\\,)");

// walk through the tokens
for (int i = 0; i < tokens.length; i++) {

// skip empty tokens, single characters, and stop words
// (the old Arrays.binarySearch lookup on stopList was replaced by contains())
if (tokens[i].length() > 1 && !Collection.stopList.contains(tokens[i])) {
Term t = new Term();
t.setTerm(tokens[i]);
t.setTf(1);

// index of t in terms; -1 if it is not there yet
int index = terms.indexOf(t);
// not seen before: add it to the term list
if (index == -1)
terms.add(t);
else {
// already present: increment its term frequency
Term tone = (terms.get(index));
tone.setTf(tone.getTf() + 1);
}
}
}

}
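// Example (assuming "for" is in stop_list.txt): for the first sentence
// "Human machine interface for Lab ABC computer applications" this produces
// human(1) machine(1) interface(1) lab(1) abc(1) computer(1) applications(1).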

// print each term of this document with its tf (number of occurrences)
public void printDocument() {
for (int j = 0; j < terms.size(); j++) {
Term t = terms.get(j);
System.out.print(t.getTerm() + "(" + t.getTf() + ")" + "\t");
}
System.out.println();
}

// getter and setter for the term list
public ArrayList<Term> getTerms() {
return terms;
}

public void setTerms(ArrayList<Term> terms) {
this.terms = terms;
}

}

static class Collection {
// the documents in the collection
private Document[] documents;

// private Document[] queryTerms;
// collection-wide term list (terms that appear in at least two documents)
private ArrayList<Term> termList = new ArrayList<Term>();
// stop-word list
static ArrayList<String> stopList = new ArrayList<String>();


// no-arg constructor
public Collection() {

}

// constructor taking the raw document texts
public Collection(String[] docs){
setDocuments(docs);
}


// load the stop-word list from StopList\stop_list.txt
public void loadStopList() throws IOException {
BufferedReader stopListRead = new BufferedReader(new FileReader(
"StopList\\stop_list.txt"));
Pattern p = Pattern.compile("\\b[A-Za-z]+\\b");
for (String str = ""; str != null; str = stopListRead.readLine()) {
if (str.length() == 0)
continue; // skip blank lines
Matcher m = p.matcher(str.toLowerCase());
while (m.find())
stopList.add(m.group());
}
}
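// (The stop-word file format is only assumed here: any plain-text file works, since the
// pattern simply collects every alphabetic token it finds on each non-blank line.)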

// load the stop words, then count term frequencies in every document
public void process() throws Exception {
// stopList is static, so loading it once through this instance is enough;
// sorting is no longer needed because lookups use contains() instead of binary search
loadStopList();

// count term frequencies in each document
for (int i = 0; i < documents.length; i++) {
documents[i].computeTerms();
}
}

// getter and setter for the document array
public Document[] getDocuments() {
return documents;
}

public void setDocuments(Document[] docs) {
this.documents = docs;
}

// build Document objects from the raw strings
public void setDocuments(String[] docs) {
documents = new Document[docs.length];
for (int i = 0; i < documents.length; i++) {
documents[i] = new Document();
documents[i].setContent(docs[i]);
}
}

// build the collection-wide term list and compute each term's df and idf
public void computeIDF() {

// collect every document's terms into termList, accumulating tf and df
for (int i = 0; i < documents.length; i++) {
for (int j = 0; j < documents[i].getTerms().size(); j++) {
// take each term of the current document
Term t = documents[i].getTerms().get(j);
int index = termList.indexOf(t);
Term newT;
if (index == -1) {// not in termList yet: create a new entry
newT = new Term();
newT.setTerm(t.getTerm());
} else {// already in termList: reuse the existing entry
newT = (Term) (termList.get(index));
}
// accumulate the collection-wide tf and df
newT.setTf(newT.getTf() + t.getTf());
newT.setDf(newT.getDf() + 1);

// DF >= 2 filter (terms should appear in at least two documents)
// 2010-03-18: filter commented out for now
/*if (newT.getDf() >= 2) {
termList.add(newT);
}*/

// add newT only when it was not in termList yet; otherwise it would be duplicated
if (index == -1)
termList.add(newT);
}
}

// compute the idf of every term in termList
for (int i = 0; i < termList.size(); i++) {
Term t = termList.get(i);

// idf = ln((N + 1) / df), where N is the total number of documents
t.setIdf(Math.log((documents.length + 1) / (double) t.getDf()));
}
}
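// Worked example with the nine sample sentences: "system" occurs in sent1, sent2 and sent3,
// so df = 3 and idf = ln((9 + 1) / 3) ≈ 1.20; a word found in only one sentence gets ln(10) ≈ 2.30.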

// compute tfIdf = (1 + ln(tf)) * idf for every term of every document
public void computeTfIdf() {
for (int i = 0; i < documents.length; i++) {
for (int j = 0; j < documents[i].getTerms().size(); j++) {
Term t = documents[i].getTerms().get(j);
int index = termList.indexOf(t);
Term newT = termList.get(index);
t.setIdf(newT.getIdf());
// note the parentheses: sublinear tf is (1 + ln(tf)), which is then multiplied by idf
t.setTfIdf((1.0 + Math.log(t.getTf())) * newT.getIdf());
}
}
}
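// Worked example: "system" in "System and human system engineering testing of EPS" has
// tf = 2, so tfIdf = (1 + ln 2) * ln(10 / 3) ≈ 1.69 * 1.20 ≈ 2.04.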


// print every term in termList with its tf, df, idf and tfIdf
public void printCollectionTermList() {

for (int i = 0; i < termList.size(); i++) {
Term t = (Term) termList.get(i);

System.out.print(t.getTerm() + "\t" + t.getTf() + "\t" + t.getDf()
+ "\t" + t.getIdf() + "\t" + t.getTfIdf());

System.out.println();
}

}

public void printTermTfIdf() {
printTermTfIdfOfDocuments(documents);
// printTermTfIdfOfDocuments(queryTerms);
}

// print the tfIdf of every term in every document
public void printTermTfIdfOfDocuments(Document[] docs) {
for (int i = 0; i < docs.length; i++) {
for (int j = 0; j < docs[i].getTerms().size(); j++) {
Term t = (Term) (docs[i].getTerms().get(j));
System.out.println(t.getTerm() + '\t' + t.getTfIdf());
}
}
}

// print each document's terms and their tf (number of occurrences)
public void printDocuments() {
for (int i = 0; i < documents.length; i++) {
documents[i].printDocument();
}
}

// print each query document's terms and their tf (kept for later use)
/*public void printQuerys() {
for (int i = 0; i < queryTerms.length; i++) {
queryTerms[i].printDocument();
}
}*/


/*public Document[] getQueryTerms() {
return queryTerms;
}

public void setQueryTerms(String[] querys) {
queryTerms = new Document[querys.length];
for (int i = 0; i < queryTerms.length; i++) {
queryTerms[i] = new Document();
queryTerms[i].setContent(querys[i]);
}
}*/

}
public static void main(String[] args) throws Exception {

String[] docs = {
"Human machine interface for Lab ABC computer applications ",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user-perceived response time to error measurement",
"The generation of random, binary, unordered trees",
"The intersection graph of paths in trees.",
"Graph minors IV: Widths of trees and well-quasi-ordering",
"Graph minors: A survey "};

// build the collection from the sentence array
Collection allDocs = new Collection(docs);

allDocs.process();
System.out.println("统计每个文档中单词的词频:");
//每篇文档里,每个单词出现的次数,即词频
allDocs.printDocuments();
System.out.println();
allDocs.computeIDF();
allDocs.computeTfIdf();
System.out.println("打印CollectionTermList:");
allDocs.printCollectionTermList();
System.out.println();
System.out.println("打印TermTfIdf:");
allDocs.printTermTfIdf();

}
}
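To make the goal concrete, here is a rough sketch of the kind of method I am hoping to add to Collection after computeTfIdf() (the name buildMatrix and the details are only illustrative; I have not actually written or tested this):

    // rows = terms that occur in at least two sentences, columns = sentences;
    // each cell is the tf-idf weight, or 0.0 when the word is absent from the sentence
    public double[][] buildMatrix() {
        ArrayList<Term> kept = new ArrayList<Term>();
        for (int i = 0; i < termList.size(); i++) {
            if (termList.get(i).getDf() >= 2)      // keep only terms with df >= 2
                kept.add(termList.get(i));
        }
        double[][] matrix = new double[kept.size()][documents.length];
        for (int i = 0; i < kept.size(); i++) {
            for (int j = 0; j < documents.length; j++) {
                int index = documents[j].getTerms().indexOf(kept.get(i));
                matrix[i][j] = (index == -1) ? 0.0 : documents[j].getTerms().get(index).getTfIdf();
            }
        }
        return matrix;
    }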
10 replies
justmewei 2010-03-23
Bumping this again.
Dazzlingwinter 2010-03-23
Nothing hard about it, just too lazy to write it...
laibianma 2010-03-23
You're not trying to get someone to write the code for you this way, are you?
doudou 2010-03-22
Too long; my first reaction was that I didn't want to read it in detail.
justmewei 2010-03-22
Don't let this sink, I'm still waiting.
justmewei 2010-03-22
[Quote=Reply #3 by ivorytower:]
Can't read something this long...
[/Quote]
If you're interested, copy it into an IDE and run it.
The idea isn't difficult; it's only written in this style so it can be called later on, which is why it looks long.
justmewei 2010-03-22
Posting the code again (it is the same as in the original post).

ivorytower 2010-03-22
Can't read something this long...
justmewei 2010-03-22
[Quote=Reply #1 by healer_kx:]
So long...
[/Quote]
It's written in a modular way, which is why it's a bit long.
healer_kx 2010-03-22
So long...