以下文章主要介绍的是如何使用Lucene进行中文分词和全文搜索
package com.sf.module.extension.lucene;
import java.io.File;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;
/**
 * Minimal helper around Lucene 3.1 + IKAnalyzer for indexing and searching
 * Chinese text. Index entries are keyed by a synthetic primary key
 * {@code dbtable + "_" + pk} so re-adding the same record replaces the old one.
 *
 * NOTE(review): not thread-safe across concurrent calls — each call opens its
 * own IndexWriter/IndexSearcher on the same directory; confirm callers serialize.
 */
public class LuceneUtils {

    /**
     * Adds one record to the index at {@code indexPath}, replacing any existing
     * document with the same {@code dbtable_pk} key.
     *
     * @param indexPath  filesystem directory holding the Lucene index
     * @param lucenceKey field name the content is indexed under (sic: historical typo kept for compatibility)
     * @param value      record to index; its content is analyzed, pk/dbtable are stored unindexed
     * @return true on success, false if indexing failed
     */
    public static boolean add(String indexPath, String lucenceKey, LuceneDB value) {
        Analyzer analyzer = new IKAnalyzer();
        IndexWriter writer = null;
        try {
            Directory dir = FSDirectory.open(new File(indexPath));
            String lucene_pk = value.getDbtable() + "_" + value.getPk();
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31, analyzer);
            writer = new IndexWriter(dir, config);
            writer.setMergeFactor(100);
            writer.setMaxBufferedDocs(100);

            Document doc = new Document();
            doc.add(new Field(lucenceKey, value.getConetnt(), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("pk", String.valueOf(value.getPk()), Field.Store.YES, Field.Index.NO));
            doc.add(new Field("dbtable", String.valueOf(value.getDbtable()), Field.Store.YES, Field.Index.NO));
            // NOT_ANALYZED so the exact key can be matched by the delete term below.
            doc.add(new Field("lucene_pk", lucene_pk, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.setBoost(value.getOrderby());

            // Atomically deletes any existing document with the same lucene_pk and
            // adds the new one. Replaces the old pattern of opening a separate
            // IndexReader just to delete (which raced with the writer and hid all
            // errors in an empty catch).
            writer.updateDocument(new Term("lucene_pk", lucene_pk), doc);
            return true;
        } catch (Exception e) {
            e.printStackTrace(); // TODO: route through a real logger
            return false;        // was: returned true even on failure
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception ignored) {
                    // close failure is secondary; primary error already reported
                }
            }
        }
    }

    /**
     * Searches the index for {@code word} in field {@code luceneKey}.
     *
     * @param indexPath filesystem directory holding the Lucene index
     * @param luceneKey field name to search
     * @param word      query text (tokenized by IKQueryParser)
     * @param maxcount  maximum number of hits to return
     * @return matching records (deduplicated by LuceneDB.equals); empty on error
     */
    public static Set<LuceneDB> search(String indexPath, String luceneKey, String word, int maxcount) {
        Set<LuceneDB> rst = new HashSet<LuceneDB>();
        IndexSearcher searcher = null;
        try {
            Directory dir = FSDirectory.open(new File(indexPath));
            searcher = new IndexSearcher(dir);
            Query query = IKQueryParser.parse(luceneKey, word);
            TopDocs topDocs = searcher.search(query, maxcount);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document targetDoc = searcher.doc(scoreDoc.doc);
                rst.add(new LuceneDB().fromDocument(targetDoc, luceneKey));
            }
        } catch (Exception e) {
            e.printStackTrace(); // TODO: route through a real logger
        } finally {
            if (searcher != null) {
                try {
                    searcher.close(); // was: searcher leaked on every call
                } catch (Exception ignored) {
                    // nothing useful to do on close failure
                }
            }
        }
        return rst;
    }

    /**
     * Registers extra words with IKAnalyzer's extension dictionary so they are
     * recognized as tokens during analysis.
     *
     * @param words words to add; duplicates are collapsed
     */
    public static void addIKAnalyzerWord(String... words) {
        Collection<String> datas = new HashSet<String>();
        for (String word : words) {
            datas.add(word);
        }
        Dictionary.loadExtendWords(datas);
    }

    /** Manual smoke test: indexes three records and runs two single-character searches. */
    public static void main(String[] args) {
        addIKAnalyzerWord("咨", "棕");

        LuceneDB value = new LuceneDB();
        value.setConetnt("你好,请问我的棕子咨询的价格是什么");
        value.setPk(1L);
        value.setDbtable("records");
        value.setOrderby(0.0f);

        LuceneDB userValue = new LuceneDB();
        userValue.setConetnt("你好,请问我的棕的上来价格是咨什么");
        userValue.setPk(2L);
        userValue.setDbtable("users");
        userValue.setOrderby(0.0f);

        LuceneDB userValue2 = new LuceneDB();
        userValue2.setConetnt("买棕了要买的上来的方式咨询");
        userValue2.setPk(3L);
        userValue2.setDbtable("users");
        userValue2.setOrderby(0.0f);

        LuceneUtils.add("d://index2", "lucene", value);
        LuceneUtils.add("d://index2", "lucene", userValue);
        LuceneUtils.add("d://index2", "lucene", userValue2);

        Set<LuceneDB> rst = LuceneUtils.search("d://index2", "lucene", "咨", 50);
        for (LuceneDB luceneDB : rst) {
            System.out.println("id:" + luceneDB.getPk() + "," + "table:" + luceneDB.getDbtable() + "," + luceneDB.getConetnt());
        }
        System.out.println("-------------------------------------");
        Set<LuceneDB> rst2 = LuceneUtils.search("d://index2", "lucene", "棕", 50);
        for (LuceneDB luceneDB : rst2) {
            System.out.println("id:" + luceneDB.getPk() + "," + "table:" + luceneDB.getDbtable() + "," + luceneDB.getConetnt());
        }
    }
}
package com.sf.module.extension.lucene;
import java.io.Serializable;
import org.apache.lucene.document.Document;
/**
 * Value object for one indexed record: a primary key, its source table name,
 * the indexed content, and a boost ("orderby") weight. Identity is defined by
 * (pk, dbtable) so search results deduplicate per record, not per content.
 */
public class LuceneDB {
    // Primary key of the source row; stored in the index as a string field "pk".
    private Serializable pk;
    // Name of the source table; stored in the index as field "dbtable".
    private String dbtable;
    // Indexed text content (sic: field name keeps the historical "conetnt" typo
    // because the getters/setters are part of the public interface).
    private String conetnt;
    // Document boost applied at index time.
    private float orderby;

    public float getOrderby() {
        return orderby;
    }

    public void setOrderby(float orderby) {
        this.orderby = orderby;
    }

    public Serializable getPk() {
        return pk;
    }

    public void setPk(Serializable pk) {
        this.pk = pk;
    }

    public String getConetnt() {
        return conetnt;
    }

    public void setConetnt(String conetnt) {
        this.conetnt = conetnt;
    }

    public String getDbtable() {
        return dbtable;
    }

    public void setDbtable(String dbtable) {
        this.dbtable = dbtable;
    }

    /**
     * Populates this object from a Lucene search hit.
     * Note: pk is read back as a String (Document.get), regardless of the
     * Serializable type it was indexed with.
     *
     * @param doc       the stored Lucene document
     * @param luceneKey field name the content was indexed under
     * @return this, for chaining
     */
    public LuceneDB fromDocument(Document doc, String luceneKey) {
        this.pk = doc.get("pk");
        this.conetnt = doc.get(luceneKey);
        this.dbtable = doc.get("dbtable");
        return this;
    }

    /**
     * Two records are equal when both pk and dbtable match.
     * Fixed: the original cast unconditionally (ClassCastException for foreign
     * types, NPE for null argument or null pk) and so violated the
     * Object.equals contract.
     */
    @Override
    public boolean equals(Object target) {
        if (this == target) {
            return true;
        }
        if (!(target instanceof LuceneDB)) {
            return false;
        }
        LuceneDB other = (LuceneDB) target;
        boolean samePk = (pk == null) ? other.pk == null : pk.equals(other.pk);
        boolean sameTable = (dbtable == null) ? other.dbtable == null : dbtable.equals(other.dbtable);
        return samePk && sameTable;
    }

    /** Consistent with equals: derived from pk and dbtable only (null-safe via string concat). */
    @Override
    public int hashCode() {
        return (this.getPk() + "" + this.getDbtable()).hashCode();
    }
}
如何使用Lucene的中文分词搜索
来源:互联网 发布日期:2011-10-05 21:08:05 浏览:25529次
导读:以下文章主要介绍的是如何使用Lucene进行中文分词和全文搜索 package com.sf.module.extension.lucene; import java.io.File; import java.io.Serializable; imp...
相关热词: lucene 中文分词 lucene数据库应用 如何使用
相关内容
AiLab云推荐
最新资讯
- 纳米缝合让复合材料更轻更坚韧
- Meta全新自研AI芯片曝光:性能相比MTIA v1提升了3倍!
- M4芯片将专注于AI!苹果据称拟全面升级Mac产品线,股价涨超4%
- 英特尔甩出全新AI训练芯片!跑千亿大模型速度超H200,罕见披露AI浮点性能
- 英特尔突袭英伟达H100,新AI芯片训练快40%,推理快50%,CEO蹦迪庆祝
- 对话东方晶源:打造中国芯片制造的GoldenFlow
- 一颗改变了世界的芯片
- 英特尔展示多模块芯片,预计为 Gaudi 3 人工智能加速器
- 台地震影响全球半导体格局?分析称可能会让供应链多元化呼声更强烈
- 三星 Exynos 2500 芯片再爆料:加码NPU芯片,构建更丰富 AI 体验
本月热点
热门排行
-
ChatGPT 设计了一款芯片
阅读量:196167
-
2023年全球风投额创5年最低,机构:今年会有更多创业公司倒闭
阅读量:195319
-
计算效率提升超60倍!中国公司杉数科技用GPU芯片开启运筹学新的“大航海时代”|钛媒体焦点
阅读量:191924
-
2023,AI创业者的50条反思
阅读量:102946
-
专访清华大学脑机接口科研团队负责人:中美“脑机接口”下一突破是什么?
阅读量:63437
-
2024,AI芯片之争加剧
阅读量:24135