6969import org .apache .lucene .index .IndexWriterConfig ;
7070import org .apache .lucene .index .IndexWriterConfig .OpenMode ;
7171import org .apache .lucene .index .IndexableField ;
72+ import org .apache .lucene .index .MultiBits ;
7273import org .apache .lucene .index .MultiTerms ;
7374import org .apache .lucene .index .PostingsEnum ;
7475import org .apache .lucene .index .StoredFields ;
8586import org .apache .lucene .store .NativeFSLockFactory ;
8687import org .apache .lucene .store .NoLockFactory ;
8788import org .apache .lucene .store .SimpleFSLockFactory ;
89+ import org .apache .lucene .util .Bits ;
8890import org .apache .lucene .util .BytesRef ;
8991import org .jetbrains .annotations .NotNull ;
9092import org .jetbrains .annotations .Nullable ;
@@ -136,6 +138,8 @@ public class IndexDatabase {
136138
137139 private static final Set <String > REVERT_COUNTS_FIELDS ;
138140
141+ private static final Set <String > LIVE_CHECK_FIELDS ;
142+
139143 private static final Object INSTANCE_LOCK = new Object ();
140144
141145 /**
@@ -169,6 +173,7 @@ public class IndexDatabase {
169173 private List <String > directories ;
170174 private LockFactory lockFactory ;
171175 private final BytesRef emptyBR = new BytesRef ("" );
176+ private final Set <String > deletedUids = new HashSet <>();
172177
173178 // Directory where we store indexes
174179 public static final String INDEX_DIR = "index" ;
@@ -177,6 +182,8 @@ public class IndexDatabase {
177182
178183 private final IndexDownArgsFactory indexDownArgsFactory ;
179184
185+ private final IndexWriterConfigFactory indexWriterConfigFactory ;
186+
180187 /**
181188 * Create a new instance of the Index Database. Use this constructor if you
182189 * don't use any projects
@@ -198,6 +205,24 @@ public IndexDatabase(Project project, IndexDownArgsFactory factory) throws IOExc
198205 indexDownArgsFactory = factory ;
199206 this .project = project ;
200207 lockFactory = NoLockFactory .INSTANCE ;
208+ indexWriterConfigFactory = new IndexWriterConfigFactory ();
209+ initialize ();
210+ }
211+
212+ /**
213+ * Create a new instance of an Index Database for a given project.
214+ *
215+ * @param project the project to create the database for
216+ * @param indexDownArgsFactory {@link IndexDownArgsFactory} instance
217+ * @param indexWriterConfigFactory {@link IndexWriterConfigFactory} instance
218+ * @throws java.io.IOException if an error occurs while creating directories
219+ */
220+ public IndexDatabase (Project project , IndexDownArgsFactory indexDownArgsFactory ,
221+ IndexWriterConfigFactory indexWriterConfigFactory ) throws IOException {
222+ this .indexDownArgsFactory = indexDownArgsFactory ;
223+ this .project = project ;
224+ lockFactory = NoLockFactory .INSTANCE ;
225+ this .indexWriterConfigFactory = indexWriterConfigFactory ;
201226 initialize ();
202227 }
203228
@@ -215,6 +240,10 @@ public IndexDatabase(Project project, IndexDownArgsFactory factory) throws IOExc
215240 REVERT_COUNTS_FIELDS .add (QueryBuilder .PATH );
216241 REVERT_COUNTS_FIELDS .add (QueryBuilder .NUML );
217242 REVERT_COUNTS_FIELDS .add (QueryBuilder .LOC );
243+
244+ LIVE_CHECK_FIELDS = new HashSet <>();
245+ LIVE_CHECK_FIELDS .add (QueryBuilder .U );
246+ LIVE_CHECK_FIELDS .add (QueryBuilder .PATH );
218247 }
219248
220249 /**
@@ -582,11 +611,7 @@ public void update() throws IOException {
582611
583612 IOException finishingException = null ;
584613 try {
585- Analyzer analyzer = AnalyzerGuru .getAnalyzer ();
586- IndexWriterConfig iwc = new IndexWriterConfig (analyzer );
587- iwc .setOpenMode (OpenMode .CREATE_OR_APPEND );
588- iwc .setRAMBufferSizeMB (env .getRamBufferSize ());
589- writer = new IndexWriter (indexDirectory , iwc );
614+ writer = new IndexWriter (indexDirectory , indexWriterConfigFactory .get ());
590615 writer .commit (); // to make sure index exists on the disk
591616 completer = new PendingFileCompleter ();
592617
@@ -610,6 +635,7 @@ public void update() throws IOException {
610635
611636 String startUid = Util .path2uid (dir , "" );
612637 reader = DirectoryReader .open (indexDirectory ); // open existing index
638+ setupDeletedUids ();
613639 countsAggregator = new NumLinesLOCAggregator ();
614640 settings = readAnalysisSettings ();
615641 if (settings == null ) {
@@ -735,10 +761,63 @@ public void update() throws IOException {
735761 }
736762 }
737763
764+ /**
765+ * The traversal of the uid terms done in {@link #processFile(IndexDownArgs, File, String)}
766+ * and {@link #processFileIncremental(IndexDownArgs, File, String)} needs to skip over deleted documents
767+ * that are often found in multi-segment indexes. This method stores the uids of these documents
768+ * and is expected to be called before the traversal for the top level directory is started.
769+ * @throws IOException if the index cannot be read for some reason
770+ */
771+ private void setupDeletedUids () throws IOException {
772+ // This method might be called repeatedly from within the same IndexDatabase instance
773+ // for various directories so the map needs to be reset so that it does not contain unrelated uids.
774+ deletedUids .clear ();
775+
776+ Bits liveDocs = MultiBits .getLiveDocs (reader ); // Will return null if there are no deletions.
777+ if (liveDocs == null ) {
778+ LOGGER .log (Level .FINEST , "no deletions found in {0}" , reader );
779+ return ;
780+ }
781+
782+ Statistics stat = new Statistics ();
783+ LOGGER .log (Level .FINEST , "traversing the documents in {0} to collect uids of deleted documents" ,
784+ indexDirectory );
785+ for (int i = 0 ; i < reader .maxDoc (); i ++) {
786+ if (!liveDocs .get (i )) {
787+ Document doc = reader .document (i , LIVE_CHECK_FIELDS ); // use limited-field version
788+ IndexableField field = doc .getField (QueryBuilder .U );
789+ if (field != null ) {
790+ if (LOGGER .isLoggable (Level .FINEST )) {
791+ String uidString = field .stringValue ();
792+ LOGGER .log (Level .FINEST , "adding ''{0}'' at {1} to deleted uid set" ,
793+ new Object []{Util .uid2url (uidString ), Util .uid2date (uidString )});
794+ }
795+ deletedUids .add (field .stringValue ());
796+ }
797+ }
798+ }
799+ stat .report (LOGGER , Level .FINEST , String .format ("found %s deleted documents in %s" ,
800+ deletedUids .size (), indexDirectory ));
801+ }
802+
803+ private void logIgnoredUid (String uid ) {
804+ LOGGER .log (Level .FINEST , "ignoring deleted document for {0} at {1}" ,
805+ new Object []{Util .uid2url (uid ), Util .uid2date (uid )});
806+ }
807+
738808 private void processTrailingTerms (String startUid , boolean usedHistory , IndexDownArgs args ) throws IOException {
739809 while (uidIter != null && uidIter .term () != null
740810 && uidIter .term ().utf8ToString ().startsWith (startUid )) {
741811
812+ if (deletedUids .contains (uidIter .term ().utf8ToString ())) {
813+ logIgnoredUid (uidIter .term ().utf8ToString ());
814+ BytesRef next = uidIter .next ();
815+ if (next == null ) {
816+ uidIter = null ;
817+ }
818+ continue ;
819+ }
820+
742821 if (usedHistory ) {
743822 // Allow for forced reindex. For history based reindex the trailing terms
744823 // correspond to the files that have not changed. Such files might need to be re-indexed
@@ -1526,68 +1605,55 @@ void indexDown(File dir, String parent, IndexDownArgs args) throws IOException {
15261605 * @param path path of the file argument relative to source root (with leading slash)
15271606 * @throws IOException on error
15281607 */
1529- private void processFileIncremental (IndexDownArgs args , File file , String path ) throws IOException {
1530- if (uidIter != null ) {
1531- path = Util .fixPathIfWindows (path );
1532- // Traverse terms until reaching one that matches the path of given file.
1533- while (uidIter != null && uidIter .term () != null
1534- && uidIter .term ().compareTo (emptyBR ) != 0
1535- && Util .uid2url (uidIter .term ().utf8ToString ()).compareTo (path ) < 0 ) {
1536-
1537- // A file that was not changed.
1538- /*
1539- * Possibly short-circuit to force reindexing of prior-version indexes.
1540- */
1541- String termPath = Util .uid2url (uidIter .term ().utf8ToString ());
1542- File termFile = new File (RuntimeEnvironment .getInstance ().getSourceRootFile (), termPath );
1543- boolean matchOK = (isWithDirectoryCounts || isCountingDeltas ) &&
1544- checkSettings (termFile , termPath );
1545- if (!matchOK ) {
1546- removeFile (false );
1608+ @ VisibleForTesting
1609+ void processFileIncremental (IndexDownArgs args , File file , String path ) throws IOException {
1610+ final boolean fileExists = file .exists ();
15471611
1548- args .curCount ++;
1549- args .works .add (new IndexFileWork (termFile , termPath ));
1550- }
1612+ path = Util .fixPathIfWindows (path );
1613+ // Traverse terms until reaching document beyond path of given file.
1614+ while (uidIter != null && uidIter .term () != null
1615+ && uidIter .term ().compareTo (emptyBR ) != 0
1616+ && Util .uid2url (uidIter .term ().utf8ToString ()).compareTo (path ) <= 0 ) {
15511617
1618+ if (deletedUids .contains (uidIter .term ().utf8ToString ())) {
1619+ logIgnoredUid (uidIter .term ().utf8ToString ());
15521620 BytesRef next = uidIter .next ();
15531621 if (next == null ) {
15541622 uidIter = null ;
15551623 }
1624+ continue ;
15561625 }
15571626
1558- if (uidIter != null && uidIter .term () != null
1559- && Util .uid2url (uidIter .term ().utf8ToString ()).equals (path )) {
1560- /*
1561- * At this point we know that the file has corresponding term in the index
1562- * and has changed in some way. Either it was deleted or it was changed.
1563- */
1564- if (!file .exists ()) {
1565- removeFile (true );
1566- } else {
1627+ /*
1628+ * Possibly short-circuit to force reindexing of prior-version indexes.
1629+ */
1630+ String termPath = Util .uid2url (uidIter .term ().utf8ToString ());
1631+ if (!termPath .equals (path )) {
1632+ // A file that was not changed.
1633+ File termFile = new File (RuntimeEnvironment .getInstance ().getSourceRootFile (), termPath );
1634+ boolean matchOK = (isWithDirectoryCounts || isCountingDeltas ) &&
1635+ checkSettings (termFile , termPath );
1636+ if (!matchOK ) {
15671637 removeFile (false );
15681638
15691639 args .curCount ++;
1570- args .works .add (new IndexFileWork (file , path ));
1571- }
1572-
1573- BytesRef next = uidIter .next ();
1574- if (next == null ) {
1575- uidIter = null ;
1640+ args .works .add (new IndexFileWork (termFile , termPath ));
15761641 }
15771642 } else {
1578- // Potentially new file. A file might be added and then deleted,
1579- // so it is necessary to check its existence.
1580- if (file .exists ()) {
1581- args .curCount ++;
1582- args .works .add (new IndexFileWork (file , path ));
1583- }
1643+ removeFile (!fileExists );
15841644 }
1585- } else {
1586- if ( file . exists ()) {
1587- args . curCount ++;
1588- args . works . add ( new IndexFileWork ( file , path )) ;
1645+
1646+ BytesRef next = uidIter . next ();
1647+ if ( next == null ) {
1648+ uidIter = null ;
15891649 }
15901650 }
1651+
1652+ // The function would not be called if the file was not changed in some way.
1653+ if (fileExists ) {
1654+ args .curCount ++;
1655+ args .works .add (new IndexFileWork (file , path ));
1656+ }
15911657 }
15921658
15931659 /**
@@ -1597,7 +1663,8 @@ private void processFileIncremental(IndexDownArgs args, File file, String path)
15971663 * @param path path corresponding to the file parameter, relative to source root (with leading slash)
15981664 * @throws IOException on error
15991665 */
1600- private void processFile (IndexDownArgs args , File file , String path ) throws IOException {
1666+ @ VisibleForTesting
1667+ void processFile (IndexDownArgs args , File file , String path ) throws IOException {
16011668 if (uidIter != null ) {
16021669 path = Util .fixPathIfWindows (path );
16031670 String uid = Util .path2uid (path ,
@@ -1611,6 +1678,15 @@ private void processFile(IndexDownArgs args, File file, String path) throws IOEx
16111678 && uidIter .term ().compareTo (emptyBR ) != 0
16121679 && uidIter .term ().compareTo (buid ) < 0 ) {
16131680
1681+ if (deletedUids .contains (uidIter .term ().utf8ToString ())) {
1682+ logIgnoredUid (uidIter .term ().utf8ToString ());
1683+ BytesRef next = uidIter .next ();
1684+ if (next == null ) {
1685+ uidIter = null ;
1686+ }
1687+ continue ;
1688+ }
1689+
16141690 // If the term's path matches path of currently processed file,
16151691 // it is clear that the file has been modified and thus
16161692 // removeFile() will be followed by call to addFile() in indexParallel().
@@ -1628,6 +1704,14 @@ private void processFile(IndexDownArgs args, File file, String path) throws IOEx
16281704
16291705 // If the file was not modified, probably skip to the next one.
16301706 if (uidIter != null && uidIter .term () != null && uidIter .term ().bytesEquals (buid )) {
1707+ if (deletedUids .contains (uidIter .term ().utf8ToString ())) {
1708+ logIgnoredUid (uidIter .term ().utf8ToString ());
1709+ BytesRef next = uidIter .next ();
1710+ if (next == null ) {
1711+ uidIter = null ;
1712+ }
1713+ return ;
1714+ }
16311715
16321716 /*
16331717 * Possibly short-circuit to force reindexing of prior-version indexes.
@@ -2035,13 +2119,13 @@ private void finishWriting() throws IOException {
20352119 try {
20362120 writeAnalysisSettings ();
20372121
2038- LOGGER .log (Level .FINE , "preparing to commit changes to Lucene index" ); // TODO add info about which database
2122+ LOGGER .log (Level .FINE , "preparing to commit changes to {0}" , this );
20392123 writer .prepareCommit ();
20402124 hasPendingCommit = true ;
20412125
2126+ Statistics completerStat = new Statistics ();
20422127 int n = completer .complete ();
2043- // TODO: add elapsed
2044- LOGGER .log (Level .FINE , "completed {0} object(s)" , n );
2128+ completerStat .report (LOGGER , Level .FINE , String .format ("completed %d object(s)" , n ));
20452129
20462130 // Just before commit(), reset the `hasPendingCommit' flag,
20472131 // since after commit() is called, there is no need for
@@ -2202,4 +2286,13 @@ private boolean xrefExistsFor(String path) {
22022286 private static class AcceptSymlinkRet {
22032287 String localRelPath ;
22042288 }
2289+
2290+ @ Override
2291+ public String toString () {
2292+ if (this .project != null ) {
2293+ return "index database for project '" + this .project .getName () + "'" ;
2294+ }
2295+
2296+ return "global index database" ;
2297+ }
22052298}
0 commit comments