Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Huge text handling #3121

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
public class FileAnalyzer extends AbstractAnalyzer {

private static final Logger LOGGER = LoggerFactory.getLogger(FileAnalyzer.class);
private static final String ANALYZER_LC = "analyzer";

/**
* @return {@code null} as there is no aligned language
Expand Down Expand Up @@ -134,10 +135,9 @@ protected FileAnalyzer(AnalyzerFactory factory,
@Override
public String getFileTypeName() {
String name = this.getClass().getSimpleName().toLowerCase(Locale.ROOT);
String suffix = "analyzer";

if (name.endsWith(suffix)) {
return name.substring(0, name.length() - suffix.length());
if (name.endsWith(ANALYZER_LC)) {
return name.substring(0, name.length() - ANALYZER_LC.length());
}

return name;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

/*
* Copyright (c) 2013, 2018 Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
* Portions Copyright (c) 2018, 2020, Chris Fraire <cfraire@me.com>.
*/
package org.opengrok.indexer.analysis;

Expand Down Expand Up @@ -52,6 +52,11 @@ public abstract class StreamSource {
*/
public abstract InputStream getStream() throws IOException;

/**
* Gets a reportable identifier of the source.
*/
public abstract String getSourceIdentifier();

/**
* Helper method that creates a {@code StreamSource} instance that
* reads data from a file.
Expand All @@ -65,6 +70,11 @@ public static StreamSource fromFile(final File file) {
public InputStream getStream() throws IOException {
return new BufferedInputStream(new FileInputStream(file));
}

@Override
public String getSourceIdentifier() {
return file.getAbsolutePath();
}
};
}

Expand All @@ -82,6 +92,11 @@ public static StreamSource fromString(final String str) {
public InputStream getStream() throws IOException {
return new ByteArrayInputStream(sbuf);
}

@Override
public String getSourceIdentifier() {
return "String";
}
};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,16 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.tools.bzip2.CBZip2InputStream;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.FileAnalyzer;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;

/**
Expand All @@ -43,17 +45,9 @@
* Created on September 22, 2005
* @author Chandan
*/
public class BZip2Analyzer extends FileAnalyzer {

private Genre g;
public class BZip2Analyzer extends CompressedAnalyzer {

@Override
public Genre getGenre() {
if (g != null) {
return g;
}
return super.getGenre();
}
private static final Logger LOGGER = LoggerFactory.getLogger(BZip2Analyzer.class);

protected BZip2Analyzer(AnalyzerFactory factory) {
super(factory);
Expand All @@ -71,11 +65,11 @@ public String getCtagsLang() {
* Gets a version number to be used to tag processed documents so that
* re-analysis can be re-done later if a stored version number is different
* from the current implementation.
* @return 20180111_00
* @return 20200417_00
*/
@Override
protected int getSpecializedVersionNo() {
return 20180111_00; // Edit comment above too!
return 20200417_00; // Edit comment above too!
}

@Override
Expand All @@ -92,20 +86,12 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut)
try (InputStream in = bzSrc.getStream()) {
fa = AnalyzerGuru.getAnalyzer(in, newname);
}
if (!(fa instanceof BZip2Analyzer)) {
if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) {
this.g = Genre.XREFABLE;
} else {
this.g = Genre.DATA;
}
fa.analyze(doc, bzSrc, xrefOut);
if (doc.get(QueryBuilder.T) != null) {
doc.removeField(QueryBuilder.T);
if (g == Genre.XREFABLE) {
doc.add(new Field(QueryBuilder.T, g.typeName(),
AnalyzerGuru.string_ft_stored_nanalyzed_norms));
}
}
if (fa == null) {
this.g = Genre.DATA;
LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname);
//TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
} else if (!(fa instanceof BZip2Analyzer)) {
analyzeUncompressed(doc, xrefOut, fa, bzSrc);
}
}
}
Expand All @@ -126,6 +112,11 @@ public InputStream getStream() throws IOException {
throw new IOException("Not BZIP2 format");
}
}

@Override
public String getSourceIdentifier() {
return src.getSourceIdentifier();
}
};
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2017-2020, Chris Fraire <cfraire@me.com>.
*/

package org.opengrok.indexer.analysis.archive;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.FileAnalyzer;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;

import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* Represents a base for compressed formats (e.g. gzip or bzip2) but not for
* archive formats that have compression (e.g. Zip or Jar).
* @author Chandan
*/
public abstract class CompressedAnalyzer extends FileAnalyzer {

    private static final Logger LOGGER = LoggerFactory.getLogger(CompressedAnalyzer.class);

    /** Buffer size used when skipping or reading the decompressed stream. */
    private static final int CHUNK_SIZE = 8 * 1024;

    /**
     * Genre determined from the uncompressed content, or {@code null} if
     * {@link #analyzeUncompressed} has not (yet) set it.
     */
    protected Genre g;

    /**
     * Gets the genre determined from the uncompressed content if it has been
     * set, or else falls back to the superclass (factory-defined) genre.
     * @return a defined instance
     */
    @Override
    public Genre getGenre() {
        if (g != null) {
            return g;
        }
        return super.getGenre();
    }

    /**
     * Initializes an instance for the specified factory.
     * @param factory a defined instance
     */
    protected CompressedAnalyzer(AnalyzerFactory factory) {
        super(factory);
    }

    /**
     * Analyzes the uncompressed content using the specified analyzer, after
     * possibly substituting the huge-text analyzer if plain-text content
     * meets the configured huge-text threshold. Sets {@link #g} from the
     * delegate analyzer's genre, and rewrites the document's type field to
     * match.
     * @param doc the document to which fields are written
     * @param xrefOut a defined instance or {@code null} if no xref is desired
     * @param fa the analyzer detected for the uncompressed content
     * @param compressedSrc a stream source which yields uncompressed content
     * @throws IOException if an I/O error occurs while analyzing
     * @throws InterruptedException if the analysis is interrupted
     */
    protected void analyzeUncompressed(
            Document doc, Writer xrefOut, AbstractAnalyzer fa, StreamSource compressedSrc)
            throws IOException, InterruptedException {

        if (fa.getGenre() == Genre.PLAIN) {
            if (meetsHugeTextThreshold(compressedSrc)) {
                String origFileTypeName = fa.getFileTypeName();
                fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer();
                g = Genre.DATA;
                if (LOGGER.isLoggable(Level.WARNING)) {
                    LOGGER.log(Level.WARNING, "{0} is compressed huge text: {1}",
                            new Object[]{origFileTypeName, compressedSrc.getSourceIdentifier()});
                }
            } else {
                g = Genre.XREFABLE;
            }
        } else if (fa.getGenre() == Genre.XREFABLE) {
            g = Genre.XREFABLE;
        } else {
            g = Genre.DATA;
        }

        fa.analyze(doc, compressedSrc, xrefOut);
        // Replace any existing type field so it reflects the genre above.
        if (doc.get(QueryBuilder.T) != null) {
            doc.removeField(QueryBuilder.T);
        }
        doc.add(new Field(QueryBuilder.T, g.typeName(),
                AnalyzerGuru.string_ft_stored_nanalyzed_norms));
    }

    /**
     * Determines whether the uncompressed content is at least as large as the
     * configured huge-text threshold, by skipping (or, failing that, reading)
     * the decompressed stream and counting bytes.
     */
    private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOException {
        RuntimeEnvironment env = RuntimeEnvironment.getInstance();
        int hugeTextThresholdBytes = env.getHugeTextThresholdBytes();
        if (Integer.MAX_VALUE == hugeTextThresholdBytes) {
            // Don't bother decompressing to count if the limit is MAX_VALUE.
            return false;
        }

        try (InputStream in = compressedSrc.getStream()) {
            // Try skip first, as it may be cheaper than reading.
            SkipResult result = meetsHugeTextThresholdBySkip(in, hugeTextThresholdBytes);
            if (result.didMeet) {
                return true;
            }

            // Even if some skipped, only read==-1 is a true indicator of EOF.
            long bytesRead = result.bytesSkipped;
            byte[] buf = new byte[CHUNK_SIZE];
            int n; // InputStream.read() returns an int byte count or -1.
            while ((n = in.read(buf, 0, buf.length)) != -1) {
                bytesRead += n;
                if (bytesRead >= hugeTextThresholdBytes) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Tries to count bytes toward the threshold using {@link InputStream#skip},
     * which some streams implement without materializing data. A skip() return
     * of zero or an {@code IOException} merely ends the attempt — the caller
     * falls back to reading from the current position.
     */
    private SkipResult meetsHugeTextThresholdBySkip(InputStream in, int hugeTextThresholdBytes) {
        long bytesSkipped = 0;
        long n;
        try {
            while ((n = in.skip(CHUNK_SIZE)) > 0) {
                bytesSkipped += n;
                if (bytesSkipped >= hugeTextThresholdBytes) {
                    return new SkipResult(bytesSkipped, true);
                }
            }
        } catch (IOException ignored) {
            // Ignore and assume not capable of skip.
        }
        return new SkipResult(bytesSkipped, false);
    }

    /** Immutable result of a skip attempt: bytes skipped and threshold verdict. */
    private static class SkipResult {
        final long bytesSkipped;
        final boolean didMeet;

        SkipResult(long bytesSkipped, boolean didMeet) {
            this.bytesSkipped = bytesSkipped;
            this.didMeet = didMeet;
        }
    }
}