lucene+heritrix day3(2)

shibenjie 2008-04-03 03:58:10

/**
 * Regular-expression search demo (contrib RegexQuery): indexes three
 * URL documents, then matches only those on the abc.com domain.
 *
 * @param path index directory path (index is rewritten, not appended)
 * @throws IOException if the index cannot be written or read
 */
public static void regexQuery(String path) throws IOException {
IndexWriter writer = new IndexWriter(path, new StandardAnalyzer(),
false);
writer.setUseCompoundFile(false);

Document doc1 = new Document();
Document doc2 = new Document();
Document doc3 = new Document();

Field f1 = new Field("url",
"http://www.abc.com/product?typeid=1&category=10&item=34",
Field.Store.YES, Field.Index.TOKENIZED);
Field f2 = new Field("url",
"http://www.def.com/product/show?typeid=3&catagory=10&item=23",
Field.Store.YES, Field.Index.TOKENIZED);
Field f3 = new Field(
"url",
"http://www.ghi.com/product/list?catagory=4&typeid=19&order=32",
Field.Store.YES, Field.Index.TOKENIZED);
doc1.add(f1);
doc2.add(f2);
doc3.add(f3);
// BUG FIX: these addDocument calls were commented out, so the index
// stayed empty and the regex search below could never match anything.
writer.addDocument(doc1);
writer.addDocument(doc2);
writer.addDocument(doc3);

writer.close();

IndexSearcher searcher = new IndexSearcher(path);
// Regular expression matching URLs whose host is *.abc.com.
String regex = "http://[a-z]{1,3}\\.abc\\.com/.*";
// Wrap the pattern in a Term; RegexQuery interprets the term text
// as a regular expression rather than a literal token.
Term t = new Term("url", regex);
RegexQuery query = new RegexQuery(t);
Hits hits = searcher.search(query);
for (int i = 0; i < hits.length(); i++) {
System.out.println(hits.doc(i));
}
searcher.close();
}

/**
 * Span-query demo. A SpanQuery matches against the positions the
 * analyzer recorded for each term in the field, so queries can be
 * constrained by how far apart terms appear. SpanQuery itself is
 * abstract; the concrete subclasses below do the actual matching.
 *
 * @param path index directory path
 * @throws IOException if the index cannot be written or read
 */
public static void spanQuery(String path) throws IOException {
IndexWriter writer = new IndexWriter(path, new StandardAnalyzer(),
false);

// One document whose content is a sequence of space-separated tokens,
// so each token lands at a predictable position.
Document doc = new Document();
Field contentField = new Field("content",
"aa bb cc dd ee ff gg hh ii jj kk",
Field.Store.YES, Field.Index.TOKENIZED);
doc.add(contentField);
writer.addDocument(doc);
writer.close();

IndexSearcher searcher = new IndexSearcher(path);

// SpanTermQuery behaves like TermQuery but also records the positions
// at which the term occurs, which the composite span queries need.
SpanTermQuery spanAa = new SpanTermQuery(new Term("content", "aa"));
SpanTermQuery spanCc = new SpanTermQuery(new Term("content", "cc"));
SpanTermQuery spanGg = new SpanTermQuery(new Term("content", "gg"));
SpanTermQuery spanKk = new SpanTermQuery(new Term("content", "kk"));

// SpanNearQuery is similar to PhraseQuery, but it can also nest other
// span queries. Second argument is the slop (maximum number of
// intervening positions); third argument says whether the clauses
// must appear in order in the source text.
SpanNearQuery nearAaCc = new SpanNearQuery(new SpanQuery[] { spanAa,
spanCc }, 1, false);
SpanNearQuery nearGgKk = new SpanNearQuery(new SpanQuery[] { spanGg,
spanKk }, 3, false);

// A SpanNearQuery built from the two queries above: it searches
// within the spans matched by those nested queries.
SpanNearQuery nested = new SpanNearQuery(new SpanNearQuery[] {
nearAaCc, nearGgKk }, 3, false);

// SpanOrQuery merges the results of several span queries.
// (Built here only to illustrate the API; not executed.)
SpanOrQuery orDemo = new SpanOrQuery(new SpanQuery[] { nearAaCc,
nearGgKk });

// SpanNotQuery removes the second query's matches from the first's.
// (Built here only to illustrate the API; not executed.)
SpanNotQuery notDemo = new SpanNotQuery(nearAaCc, nearGgKk);

Hits hits = searcher.search(nested);
for (int i = 0; i < hits.length(); i++) {
System.out.println(hits.doc(i));
}
searcher.close();
}

/**
 * Wildcard search demo: '?' matches exactly one character, '*' matches
 * zero or more, so "?o*" matches any term whose second letter is 'o'.
 *
 * @param path index directory path (must already contain an index)
 * @throws IOException if the index cannot be read
 */
public static void wildcardQuery(String path) throws IOException {
IndexSearcher searcher = new IndexSearcher(path);

// Build a term whose text carries the wildcard pattern.
Term pattern = new Term("content", "?o*");
WildcardQuery wildcard = new WildcardQuery(pattern);

Hits results = searcher.search(wildcard);
for (int rank = 0; rank < results.length(); rank++) {
System.out.println(results.doc(rank));
}

searcher.close();
}

/**
 * Fuzzy search demo: indexes six single-word documents and runs an
 * edit-distance ("fuzzy") match against the term "work".
 *
 * @param path index directory path (index is rewritten, not appended)
 * @throws IOException if the index cannot be written or read
 */
public static void fuzzyQuery(String path) throws IOException {
IndexWriter writer = new IndexWriter(path, new StandardAnalyzer(),
false);
writer.setUseCompoundFile(false);

// One document per word, all stored in the same "content" field.
String[] words = { "word", "work", "seed", "sword", "world", "ford" };
for (int i = 0; i < words.length; i++) {
Document doc = new Document();
doc.add(new Field("content", words[i], Field.Store.YES,
Field.Index.TOKENIZED));
writer.addDocument(doc);
}
writer.close();

IndexSearcher searcher = new IndexSearcher(path);
// Base term to fuzzy-match against.
Term t = new Term("content", "work");

// Default constructor uses a minimum similarity of 0.5: lowering it
// loosens the match and returns more documents, raising it tightens
// the match and returns fewer. The three variants below illustrate
// the constructors; only the last one is executed.
FuzzyQuery query = new FuzzyQuery(t);
FuzzyQuery query1 = new FuzzyQuery(t, 0.1f);
// Third argument is the prefix length: the first character must
// match exactly before fuzzy comparison begins.
FuzzyQuery query2 = new FuzzyQuery(t, 0.1f, 1);

Hits hits = searcher.search(query2);
for (int i = 0; i < hits.length(); i++) {
System.out.println(hits.doc(i));
}
searcher.close();
}
...全文
64 回复 打赏 收藏 转发到动态 举报
写回复
用AI写文章
回复
切换为时间正序
请发表友善的回复…
发表回复

62,614

社区成员

发帖
与我相关
我的任务
社区描述
Java 2 Standard Edition
社区管理员
  • Java SE
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧