1. Lucene 설정을 위해 소스를 내려받는다. (이후 Luke와 함께 사용하기 위해 3.5.0 버전을 받는다.)
(http://grepcode.com/snapshot/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.5.0)
2. Eclipse 환경에서 Java 프로젝트를 만들고, 아래의 Indexer.java 소스 코드를 import한다.
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.Analyzer.*;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.TokenStream;
//import org.apache.lucene.analysis.kr.KoreanAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Date;
/**
 * Minimal Lucene 3.5 indexer: walks a data directory recursively and adds every
 * readable {@code .txt} file to a file-system index.
 *
 * <p>Each document gets two fields: {@code contents} (the file body, read as UTF-8
 * and tokenized by the configured analyzer, not stored) and {@code filename}
 * (the canonical path, stored but not indexed).
 */
public class Indexer {
    /** Index location used when no command-line argument is given. */
    private static final String DEFAULT_INDEX_PATH = "D:\\IR\\index";

    /** Data location used when no second command-line argument is given. */
    private static final String DEFAULT_DATA_PATH = "D:\\IR\\data";

    /**
     * Indexes the data directory and prints how many files were indexed and how long it took.
     *
     * @param args optional: {@code args[0]} is the index directory, {@code args[1]} the data
     *             directory; missing values fall back to the built-in defaults
     * @throws Exception if the index cannot be opened or indexing fails
     */
    public static void main(String[] args) throws Exception {
        String idxPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_PATH;
        String dataPath = (args != null && args.length > 1) ? args[1] : DEFAULT_DATA_PATH;

        File dataDir = new File(dataPath);
        Directory idxDir = FSDirectory.open(new File(idxPath));
        try {
            long start = System.currentTimeMillis();
            int numIndexed = index(idxDir, dataDir);
            long end = System.currentTimeMillis();
            System.out.println("Indexing " + numIndexed + " files took "
                    + (end - start) + " milliseconds");
        } finally {
            // Release the directory handle even when indexing fails.
            idxDir.close();
        }
    }

    /**
     * Indexes all {@code .txt} files found under {@code dataDir} into {@code idxDir}.
     *
     * <p>The writer is always closed, so the index write lock is released even if indexing
     * fails part-way; documents added before the failure are committed by that close.
     *
     * @param idxDir  the Lucene directory that holds the index
     * @param dataDir the root directory to scan for text files
     * @return the number of documents added during this call (documents already present in
     *         the index before the call are not counted)
     * @throws IOException if {@code dataDir} does not exist, is not a directory, or an I/O
     *                     error occurs while indexing
     */
    public static int index(Directory idxDir, File dataDir) throws IOException {
        System.out.println("index------------------------");
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir
                    + " does not exist or is not a directory");
        }
        System.out.println("data dir exist~!!!");

        // NOTE(review): KeywordAnalyzer emits each field value as a single token, so the whole
        // file body becomes one term. If per-word search is wanted, StandardAnalyzer (already
        // imported by this file) is presumably the intended choice; confirm before changing.
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, new KeywordAnalyzer());
        System.out.println("conf----->" + conf.getAnalyzer());

        IndexWriter writer = new IndexWriter(idxDir, conf);
        try {
            // The writer opens an existing index in append mode by default, so count only
            // what this run adds instead of reporting the index's running total.
            int docsBefore = writer.numDocs();
            indexDirectory(writer, dataDir);
            int numIndexed = writer.numDocs() - docsBefore;
            writer.optimize();
            return numIndexed;
        } finally {
            writer.close();
        }
    }

    /**
     * Recursively adds every {@code .txt} file below {@code dir} to the index.
     *
     * @param writer the open index writer
     * @param dir    the directory to scan
     * @throws IOException if adding a file to the index fails
     */
    private static void indexDirectory(IndexWriter writer, File dir)
            throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read; skip it the same
            // way unreadable files are skipped rather than failing with a NullPointerException.
            System.err.println("Skipping unreadable directory " + dir);
            return;
        }
        for (File f : files) {
            if (f.isDirectory()) {
                System.out.println("f.isDirectory");
                indexDirectory(writer, f); // recurse
            } else if (f.getName().endsWith(".txt")) {
                System.out.println(".txt ===========>" + f.getName());
                indexFile(writer, f);
            }
        }
    }

    /**
     * Adds a single file to the index; hidden, missing, or unreadable files are skipped.
     *
     * @param writer the open index writer
     * @param f      the file to index
     * @throws IOException if the file cannot be read or the document cannot be added
     */
    private static void indexFile(IndexWriter writer, File f)
            throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        System.out.println("Indexing " + f.getCanonicalPath());

        // Read the file body as UTF-8.
        Reader contents = new InputStreamReader(new FileInputStream(f), "UTF-8");
        try {
            Document doc = new Document();
            doc.add(new Field("contents", contents));
            doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES, Field.Index.NO));
            writer.addDocument(doc);
        } finally {
            // Make sure the file handle is released even if addDocument fails; closing an
            // already-closed reader is harmless.
            contents.close();
        }
    }
}
3. Java 코드만 넣으면 Lucene 클래스를 찾지 못해 컴파일 오류가 나므로, 이를 해결하기 위해 내려받은 Lucene jar 파일을 프로젝트의 Build Path에 추가한다.
4. 이제 실행(Run)하기 위해 디렉터리를 2개(data와 index) 만들고,
data 디렉터리에는 색인할 샘플 txt 파일을 넣어준다.
5. 결과 화면(자료를 수집한 모습)
'Study' 카테고리의 다른 글
[LINUX] OS bit 수 확인 방법, ulimit (0) | 2014.07.09 |
---|---|
File I/O java (0) | 2014.05.15 |
Springframework (0) | 2012.11.28 |
Luke - 3 (0) | 2012.10.05 |
Lucene - 1 (0) | 2012.10.04 |