본문 바로가기

Lucene

[Lucene] IndexCommit - lucene4.2.1

IndexDeletionPolicy에 대한 내용을 보기전에 IndexCommit에 대한 학습예제를 조금 

작성해보았습니다.

lucene 소스에 있는 IndexCommit 테스트케이스에는 두개의 IndexCommit에 대한 비교만

들어가 있네요.. 실제 commit시에 어떠한 값을 가지고 있는지 확인해보고 싶어서 아래와 같이

출력만하는 학습테스트 클래스를 작성하였습니다.



package com.tistory.devyongsik.commit;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
/**
 * Learning test that prints the state of the most recent {@link IndexCommit}
 * after each add / delete / commit step on an in-memory index.
 *
 * <p>Every document is committed on its own, so the printed segment count can be
 * compared step by step. Nothing is asserted; the test only writes to stdout.
 */
public class CommitTest {

    /** Line printed between two consecutive commit dumps. */
    private static final String SEPARATOR = "############################################";

    /**
     * Adds three documents (one commit each), then deletes by term twice, dumping
     * the latest commit point after every step.
     *
     * @throws IOException if the index cannot be written or read
     */
    @Test
    public void commitTest() throws IOException {
        String a = "learning perl learning java learning ruby";
        String b = "perl test t";
        String c = "perl test t learning";

        // In-memory index; swap in an FSDirectory to inspect the files on disk.
        Directory dir = new RAMDirectory();
        // Analyzer used to tokenize the document content.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_42, analyzer);
        iwc.setOpenMode(OpenMode.CREATE);

        IndexWriter writer = new IndexWriter(dir, iwc);
        try {
            FieldType f1type = new FieldType();
            f1type.setIndexed(true);
            f1type.setStored(false);
            f1type.setTokenized(true);
            f1type.setStoreTermVectors(true);
            f1type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

            // One document per commit.
            writer.addDocument(newDocument(a, f1type));
            writer.commit();
            printLatestCommit(dir);
            System.out.println(SEPARATOR);

            writer.addDocument(newDocument(b, f1type));
            writer.commit();
            printLatestCommit(dir);
            System.out.println(SEPARATOR);

            writer.addDocument(newDocument(c, f1type));
            writer.commit();
            printLatestCommit(dir);
            System.out.println(SEPARATOR);

            // Delete by term, then dump BEFORE committing: the reader is opened
            // from the directory (not from the writer), so this shows what is
            // visible there while the delete is still uncommitted.
            writer.deleteDocuments(new Term("f", "java"));
            printLatestCommit(dir);
            writer.commit();
            System.out.println(SEPARATOR);

            // Same dump again, now after the delete has been committed.
            printLatestCommit(dir);
            System.out.println(SEPARATOR);

            // "perl" occurs in all three input strings.
            writer.deleteDocuments(new Term("f", "perl"));
            writer.commit();
            printLatestCommit(dir);
        } finally {
            // Close the writer even when one of the steps above throws.
            writer.close();
        }
    }

    /** Builds a document holding {@code text} in the single field "f". */
    private static Document newDocument(String text, FieldType type) {
        Document doc = new Document();
        doc.add(new Field("f", text, type));
        return doc;
    }

    /**
     * Opens a reader on {@code dir}, prints generation, segment count, segments
     * file name, file names and the deleted flag of its commit, then closes the
     * reader so it is not leaked.
     */
    private static void printLatestCommit(Directory dir) throws IOException {
        DirectoryReader reader = DirectoryReader.open(dir);
        try {
            IndexCommit commit = reader.getIndexCommit();
            System.out.println(commit.getGeneration());
            System.out.println(commit.getSegmentCount());
            System.out.println(commit.getSegmentsFileName());
            System.out.println(commit.getFileNames());
            System.out.println(commit.isDeleted());
        } finally {
            reader.close();
        }
    }
}
//Result
1
1
segments_1
[_0.fnm, _0_Lucene41_0.pos, _0.tvd, _0.nvm, _0_Lucene41_0.pay, _0_Lucene41_0.doc, _0.tvx, segments_1, _0.nvd, _0.fdx, _0.si, _0_Lucene41_0.tim, _0.fdt, _0_Lucene41_0.tip]
false
############################################
2
2
segments_2
[_1_Lucene41_0.doc, _0_Lucene41_0.pos, _1.fnm, _1_Lucene41_0.pay, _0.nvd, _0.si, _0_Lucene41_0.tim, _0_Lucene41_0.tip, _1.nvm, _1.tvx, _0.fnm, _1_Lucene41_0.tim, _0.tvd, _1.nvd, _1_Lucene41_0.tip, _0.nvm, _1_Lucene41_0.pos, _1.fdx, _0_Lucene41_0.pay, _0.tvx, _0_Lucene41_0.doc, _1.fdt, _1.si, segments_2, _0.fdx, _1.tvd, _0.fdt]
false
############################################
3
3
segments_3
[_2.si, _2_Lucene41_0.pos, _1_Lucene41_0.doc, _2_Lucene41_0.tim, _0_Lucene41_0.pos, _2_Lucene41_0.tip, _1.fnm, _2.tvx, _2.tvd, _1_Lucene41_0.pay, _0.nvd, _0.si, _0_Lucene41_0.tim, _0_Lucene41_0.tip, _1.nvm, _1.tvx, _0.fnm, _1_Lucene41_0.tim, _0.tvd, _1.nvd, _2.fdt, _1_Lucene41_0.tip, _2_Lucene41_0.doc, _0.nvm, _2.fdx, _2.fnm, _1_Lucene41_0.pos, _1.fdx, _0_Lucene41_0.pay, _2.nvm, _2_Lucene41_0.pay, _0.tvx, _0_Lucene41_0.doc, _1.fdt, _1.si, _2.nvd, _0.fdx, _1.tvd, segments_3, _0.fdt]
false
############################################
3
3
segments_3
[_2.si, _2_Lucene41_0.pos, _1_Lucene41_0.doc, _2_Lucene41_0.tim, _0_Lucene41_0.pos, _2_Lucene41_0.tip, _1.fnm, _2.tvx, _2.tvd, _1_Lucene41_0.pay, _0.nvd, _0.si, _0_Lucene41_0.tim, _0_Lucene41_0.tip, _1.nvm, _1.tvx, _0.fnm, _1_Lucene41_0.tim, _0.tvd, _1.nvd, _2.fdt, _1_Lucene41_0.tip, _2_Lucene41_0.doc, _0.nvm, _2.fdx, _2.fnm, _1_Lucene41_0.pos, _1.fdx, _0_Lucene41_0.pay, _2.nvm, _2_Lucene41_0.pay, _0.tvx, _0_Lucene41_0.doc, _1.fdt, _1.si, _2.nvd, _0.fdx, _1.tvd, segments_3, _0.fdt]
false
############################################
4
2
segments_4
[_2_Lucene41_0.pos, _2.si, _1_Lucene41_0.doc, _2_Lucene41_0.tim, _2_Lucene41_0.tip, _1.fnm, _2.tvx, _2.tvd, _1_Lucene41_0.pay, _1.tvx, _1.nvm, _1_Lucene41_0.tim, _1.nvd, _2.fdt, _1_Lucene41_0.tip, _2_Lucene41_0.doc, _2.fdx, _2.fnm, _1.fdx, _1_Lucene41_0.pos, _2.nvm, _2_Lucene41_0.pay, _1.fdt, _1.si, _2.nvd, _1.tvd, segments_4]
false
############################################
5
0
segments_5
[segments_5]
false

결과를 보시면..

1. commit이 일어날 때마다 generation이 계속 증가합니다.

2. 문서를 추가한 뒤 commit이 일어날 때마다 segmentcount 역시 증가합니다.

3. segmentcount의 경우 위 예제에서의 결과만으로 보면, 문서를 하나씩 indexWriter에 add 후 commit을 하였기 때문에 문서 하나가 하나의 세그먼트에 들어갔고, 그 결과 "java" term으로 삭제하고 commit을 했을 때 generation은 4로 증가하였지만, segmentcount는 해당 문서만 들어 있던 세그먼트(_0)가 통째로 제거되면서 2로 줄어든 것을 확인할 수 있습니다.

만약, document1,2를 한번에 add 후 commit을 하였다면 위 결과는 조금 다를 것입니다. 그 코드는 밑에 다시 붙여넣을게요..

4. segmentsFileName 역시 segments_N의 형태로 나타납니다.

5. commit 후 남아있는 파일 리스트를 getFileNames()로 받아올 수 있습니다.

6. isDeleted의 값은 이 테스트에서는 어디에 사용되고 무엇을 뜻하는지 잘 알 수 없네요..



위 3번에서 말씀드린 예제를 보시면..



package com.tistory.devyongsik.commit;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
/**
 * Learning test that prints the state of the most recent {@link IndexCommit}
 * when the first two documents are added under a single commit.
 *
 * <p>Nothing is asserted; the test only writes to stdout.
 */
public class CommitTest {

    /** Line printed between two consecutive commit dumps. */
    private static final String SEPARATOR = "############################################";

    /**
     * Adds two documents under one commit and a third under its own commit, then
     * deletes by term twice, dumping the latest commit point after every step.
     *
     * @throws IOException if the index cannot be written or read
     */
    @Test
    public void commitMergSegmentTest() throws IOException {
        String a = "learning perl learning java learning ruby";
        String b = "perl test t";
        String c = "perl test t learning";

        // In-memory index; swap in an FSDirectory to inspect the files on disk.
        Directory dir = new RAMDirectory();
        // Analyzer used to tokenize the document content.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_42, analyzer);
        iwc.setOpenMode(OpenMode.CREATE);

        IndexWriter writer = new IndexWriter(dir, iwc);
        try {
            FieldType f1type = new FieldType();
            f1type.setIndexed(true);
            f1type.setStored(false);
            f1type.setTokenized(true);
            f1type.setStoreTermVectors(true);
            f1type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

            // Two documents are added before the first commit.
            writer.addDocument(newDocument(a, f1type));
            writer.addDocument(newDocument(b, f1type));
            writer.commit();
            printLatestCommit(dir);
            System.out.println(SEPARATOR);

            writer.addDocument(newDocument(c, f1type));
            writer.commit();
            printLatestCommit(dir);
            System.out.println(SEPARATOR);

            // Delete by term, then dump BEFORE committing: the reader is opened
            // from the directory (not from the writer), so this shows what is
            // visible there while the delete is still uncommitted.
            writer.deleteDocuments(new Term("f", "java"));
            printLatestCommit(dir);
            writer.commit();
            System.out.println(SEPARATOR);

            // Same dump again, now after the delete has been committed.
            printLatestCommit(dir);
            System.out.println(SEPARATOR);

            // "perl" occurs in all three input strings.
            writer.deleteDocuments(new Term("f", "perl"));
            writer.commit();
            printLatestCommit(dir);
        } finally {
            // Close the writer even when one of the steps above throws.
            writer.close();
        }
    }

    /** Builds a document holding {@code text} in the single field "f". */
    private static Document newDocument(String text, FieldType type) {
        Document doc = new Document();
        doc.add(new Field("f", text, type));
        return doc;
    }

    /**
     * Opens a reader on {@code dir}, prints generation, segment count, segments
     * file name, file names and the deleted flag of its commit, then closes the
     * reader so it is not leaked.
     */
    private static void printLatestCommit(Directory dir) throws IOException {
        DirectoryReader reader = DirectoryReader.open(dir);
        try {
            IndexCommit commit = reader.getIndexCommit();
            System.out.println(commit.getGeneration());
            System.out.println(commit.getSegmentCount());
            System.out.println(commit.getSegmentsFileName());
            System.out.println(commit.getFileNames());
            System.out.println(commit.isDeleted());
        } finally {
            reader.close();
        }
    }
}
// Result....
1
1
segments_1
[_0.fnm, _0_Lucene41_0.pos, _0.tvd, _0.nvm, _0_Lucene41_0.pay, _0_Lucene41_0.doc, _0.tvx, segments_1, _0.nvd, _0.fdx, _0.si, _0_Lucene41_0.tim, _0.fdt, _0_Lucene41_0.tip]
false
############################################
2
2
segments_2
[_1_Lucene41_0.doc, _0_Lucene41_0.pos, _1.fnm, _1_Lucene41_0.pay, _0.nvd, _0.si, _0_Lucene41_0.tim, _0_Lucene41_0.tip, _1.nvm, _1.tvx, _0.fnm, _1_Lucene41_0.tim, _0.tvd, _1.nvd, _1_Lucene41_0.tip, _0.nvm, _1_Lucene41_0.pos, _1.fdx, _0_Lucene41_0.pay, _0.tvx, _0_Lucene41_0.doc, _1.fdt, _1.si, segments_2, _0.fdx, _1.tvd, _0.fdt]
false
############################################
2
2
segments_2
[_1_Lucene41_0.doc, _0_Lucene41_0.pos, _1.fnm, _1_Lucene41_0.pay, _0.nvd, _0.si, _0_Lucene41_0.tim, _0_Lucene41_0.tip, _1.nvm, _1.tvx, _0.fnm, _1_Lucene41_0.tim, _0.tvd, _1.nvd, _1_Lucene41_0.tip, _0.nvm, _1_Lucene41_0.pos, _1.fdx, _0_Lucene41_0.pay, _0.tvx, _0_Lucene41_0.doc, _1.fdt, _1.si, segments_2, _0.fdx, _1.tvd, _0.fdt]
false
############################################
3
2
segments_3
[_1_Lucene41_0.doc, _0_Lucene41_0.pos, _1.fnm, _1_Lucene41_0.pay, _0.nvd, _0.si, _0_Lucene41_0.tim, _0_Lucene41_0.tip, _1.nvm, _1.tvx, _0.fnm, _1_Lucene41_0.tim, _0.tvd, _1.nvd, _0_1.del, _1_Lucene41_0.tip, _0.nvm, _1_Lucene41_0.pos, _1.fdx, _0_Lucene41_0.pay, _1.fdt, _0.tvx, _0_Lucene41_0.doc, _1.si, _0.fdx, _1.tvd, segments_3, _0.fdt]
false
############################################
4
0
segments_4
[segments_4]
false

위와 같이 나오네요... document1,2가 같은 세그먼트(_0)에 들어 있기 때문에, "java" term으로 삭제 시에 세그먼트가 제거되지 않고 삭제 정보 파일(_0_1.del)만 추가되어 segmentCount가 2로 유지되는 것을 보실 수 있습니다.


이 내용을 바탕으로 IndexDeletionPolicy 부분을 좀 살펴봐야겠습니다.