67,512
社区成员
发帖
与我相关
我的任务
分享
// Fragment of an upload handler. For one uploaded (non-form-field) item it writes the
// file under saveDir, records it as a Document, extracts relations sentence by sentence,
// picks high-frequency keywords, and persists everything through the Hibernate session.
// NOTE(review): the enclosing method/loop and the closing of the try and if blocks are
// outside this excerpt.
FileItem fileItem = (FileItem) fileItr.next();
// System.out.println(item.getString());
if (!fileItem.isFormField()) {
String name = fileItem.getName();
try {
// NOTE(review): name is taken from the upload as-is and concatenated onto saveDir.
// If FileItem is the Commons FileUpload type, some clients include path information in
// getName(); consider reducing it to the base name before building the path. TODO confirm.
File file = new File(saveDir + name);
// The uploaded file
Document document = new Document();
document.setDocLocation(saveDir);
document.setDocName(file.getName());
// Store the document in the database
//documentDB.InsertDocument(document);
// NOTE(review): the document is persisted before the file is written to disk below.
session.persist(document);
//Long docId = document.getDocId();
fileItem.write(file);
// Text content processing starts here
String content = CUPFileUtils.readFileContent(file);
// Sentences before word segmentation
String[] sentences = CUPStringUtils.spliteContentWithSentence(content);
// Content after word segmentation
String segContent = CUPStringUtils.splitContent(content, path);
// Sentences split from the segmented content
// Cannot be used this way: punctuation is removed after JE word segmentation.
// NOTE(review): segSentences is not read again anywhere in this excerpt.
String[] segSentences = CUPStringUtils.spliteContentWithSentence(segContent);
// Batch the relation inserts
//Transaction tx = session.beginTransaction();
// Counts the inserts
int index = 0;
// Look for a relation in each sentence; if one is found, store it in the database.
for(String sentence : sentences) {
// Each sentence is segmented individually and then split on single spaces into words.
sentence = CUPStringUtils.splitContent(sentence, path);
String[] words = sentence.split(" ");
TextRelation relation = CUPStringUtils.findRelation2(words, verbs, subjecs, objects);
if(relation != null) {
index++;
session.persist(relation);
document.getRelations().add(relation);
// Flush after every 30 inserts (currently disabled)
/*if(index % 30 == 0) {
session.flush();
session.clear();
}*/
}
}
// Restart the count
index = 0;
//Transaction tx2 = session.beginTransaction();
// Count word frequencies
Map<String, Integer> map = CUPStringUtils
.countWords(segContent);
// Sort the result
Map<String, Integer> sortMap = CUPStringUtils
.sortMap(map);
// Total word frequency
// NOTE(review): auto-unboxing throws NullPointerException if countWords did not put this key.
int totalWords = map.get("#全部词频");
// NOTE(review): if sortMap still contains the total-count entry it is treated like any
// other word in this loop; verify against sortMap/countWords.
for(String word : sortMap.keySet()) {
// Share of this word among all counted words
double weigh = (double)sortMap.get(word)/totalWords;
// A word counts as high-frequency when it occurs more than 10 times or makes up over 1% of all words
if(sortMap.get(word) > 10 || weigh>0.01) {
KeyWord keyWord = new KeyWord();
index ++;
// Set the part of speech
if(wordList.containsKey(word)) {
keyWord.setWordMark(wordList.get(word));
// If the word is a noun, store the sentences containing it in the database
if(wordList.get(word).equals("n") || wordList.get(word).equals("N")) {
Set<String> ss = CUPStringUtils.sentencesWithKeyWord(word, sentences);
keyWord.getSentences().addAll(ss);
}
} else{
keyWord.setWordMark("未知词性");
}
keyWord.setWord(word);
keyWord.setFrequency(sortMap.get(word));
keyWord.setWeigh(weigh);
// Store the keyword in the database
session.persist(keyWord);
document.getKeywordSet().add(keyWord);
/*if(index % 30 == 0) {
session.flush();
session.clear();
}*/
}
}
// NOTE(review): tx is not declared in this excerpt (the local declaration above is
// commented out); presumably it is opened earlier in the method. No rollback path is visible here.
tx.commit();
//tx2.commit();
HibernateSessionFactory.closeSession();
System.out.println("提取知识结束========");
/**
 * A subject / predicate / object relation. Instances are produced per sentence by the
 * extraction code and persisted through Hibernate.
 * NOTE(review): no accessors are shown in this excerpt; presumably they were omitted.
 */
public class TextRelation {
// Identifier of the relation.
private Long id;
// Subject part of the relation.
private String subject;
// Predicate part of the relation.
private String predicate;
// Object part of the relation.
private String object;
}
/**
 * A high-frequency word found in a document, with its frequency statistics and,
 * for nouns, the sentences that contain it.
 * NOTE(review): the rest of the class (accessors, closing brace) is not shown in this excerpt.
 */
public class KeyWord {
// Identifier of the keyword.
private Long wordId;
// The word itself.
private String word;
// Part-of-speech tag of the word; the extraction code stores a fixed placeholder string when the tag is unknown.
private String wordMark;
// Number of occurrences of the word in the document.
private Integer frequency;
// Occurrences of the word divided by the total word count of the document.
private Double weigh;
// Sentences containing the word; the extraction code fills this only for words tagged "n" or "N".
private Set<String> sentences = new HashSet<String>();
/**
 * An uploaded document together with the keywords and relations extracted from it.
 * NOTE(review): the rest of the class (accessors, closing brace) is not shown in this excerpt.
 */
public class Document {
// Identifier of the document.
private Long docId;
// Directory the uploaded file was saved to.
private String docLocation;
// File name of the uploaded file.
private String docName;
// Keywords extracted from this document.
private Set<KeyWord> keywordSet = new HashSet<KeyWord>();
// Relations extracted from this document.
private Set<TextRelation> relations = new HashSet<TextRelation>();
<!-- Maps TextRelation to the "relation" table. -->
<class name="TextRelation" table="relation">
<!-- Primary key relation_id, generated with the "native" strategy. -->
<id name="id" column="relation_id">
<generator class="native" />
</id>
<!-- Subject, predicate and object are all mandatory string columns. -->
<property name="subject" column="subject" type="string"
not-null="true" />
<property name="predicate" column="predicate" type="string"
not-null="true" />
<property name="object" column="object" type="string"
not-null="true" />
</class>
<!-- Maps KeyWord to the "keyword" table. -->
<class name="KeyWord" table="keyword">
<!-- Primary key wordid, generated with the "native" strategy. -->
<id name="wordId" column="wordid">
<generator class="native" />
</id>
<!-- All scalar properties are mandatory columns. -->
<property name="word" column="word" type="string"
not-null="true" />
<property name="wordMark" column="wordmark" type="string"
not-null="true" />
<property name="frequency" column="frequency" type="integer"
not-null="true" />
<property name="weigh" column="weigh" type="double"
not-null="true" />
<!-- Collection of plain strings stored in a separate table, one row per sentence, keyed by wordid. -->
<!-- NOTE(review): the table name "sentene_with_keyword" looks like a typo for "sentence_with_keyword"; renaming it requires a matching schema change. -->
<set name="sentences" table="sentene_with_keyword">
<key column="wordid"/>
<element type="string" column="sentence"></element>
</set>
</class>
<!-- Maps Document to the "document" table. -->
<class name="Document" table="document">
<!-- Primary key docid, generated with the "native" strategy. -->
<id name="docId" column="docid">
<generator class="native" />
</id>
<property name="docName" column="docName" type="string"
not-null="true" />
<property name="docLocation" column="docLocation" type="string"
not-null="true" />
<!-- Keywords of the document, linked through the join table doc_keyword. -->
<!-- unique="true" on the many-to-many column limits each keyword to one document, i.e. a one-to-many association on a join table. -->
<set name="keywordSet" table="doc_keyword">
<key column="docid"></key>
<many-to-many column="wordid"
unique="true"
class="KeyWord"/>
</set>
<!-- Relations of the document, linked through the join table doc_relation with the same unique constraint. -->
<!-- NOTE(review): neither set declares a cascade, which matches the Java code persisting each KeyWord and TextRelation explicitly. -->
<set name="relations" table="doc_relation">
<key column="docid"></key>
<many-to-many column="relationid"
unique="true"
class="TextRelation"/>
</set>
</class>