Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Huge text handling #3121

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
public class FileAnalyzer extends AbstractAnalyzer {

private static final Logger LOGGER = LoggerFactory.getLogger(FileAnalyzer.class);
private static final String ANALYZER_LC = "analyzer";

/**
* @return {@code null} as there is no aligned language
Expand Down Expand Up @@ -134,10 +135,9 @@ protected FileAnalyzer(AnalyzerFactory factory,
@Override
public String getFileTypeName() {
String name = this.getClass().getSimpleName().toLowerCase(Locale.ROOT);
String suffix = "analyzer";

if (name.endsWith(suffix)) {
return name.substring(0, name.length() - suffix.length());
if (name.endsWith(ANALYZER_LC)) {
return name.substring(0, name.length() - ANALYZER_LC.length());
}

return name;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

/*
* Copyright (c) 2013, 2018 Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
* Portions Copyright (c) 2018, 2020, Chris Fraire <cfraire@me.com>.
*/
package org.opengrok.indexer.analysis;

Expand Down Expand Up @@ -52,6 +52,11 @@ public abstract class StreamSource {
*/
public abstract InputStream getStream() throws IOException;

/**
* Gets a reportable identifier of the source.
*/
public abstract String getSourceIdentifier();

/**
* Helper method that creates a {@code StreamSource} instance that
* reads data from a file.
Expand All @@ -65,6 +70,11 @@ public static StreamSource fromFile(final File file) {
public InputStream getStream() throws IOException {
return new BufferedInputStream(new FileInputStream(file));
}

@Override
public String getSourceIdentifier() {
return file.getAbsolutePath();
}
};
}

Expand All @@ -82,6 +92,11 @@ public static StreamSource fromString(final String str) {
public InputStream getStream() throws IOException {
return new ByteArrayInputStream(sbuf);
}

@Override
public String getSourceIdentifier() {
return "String";
}
};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,16 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.tools.bzip2.CBZip2InputStream;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.FileAnalyzer;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;

/**
Expand All @@ -43,17 +45,9 @@
* Created on September 22, 2005
* @author Chandan
*/
public class BZip2Analyzer extends FileAnalyzer {

private Genre g;
public class BZip2Analyzer extends CompressedAnalyzer {

@Override
public Genre getGenre() {
if (g != null) {
return g;
}
return super.getGenre();
}
private static final Logger LOGGER = LoggerFactory.getLogger(BZip2Analyzer.class);

protected BZip2Analyzer(AnalyzerFactory factory) {
super(factory);
Expand All @@ -71,11 +65,11 @@ public String getCtagsLang() {
* Gets a version number to be used to tag processed documents so that
* re-analysis can be re-done later if a stored version number is different
* from the current implementation.
* @return 20180111_00
* @return 20200417_00
*/
@Override
protected int getSpecializedVersionNo() {
return 20180111_00; // Edit comment above too!
return 20200417_00; // Edit comment above too!
}

@Override
Expand All @@ -92,20 +86,12 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut)
try (InputStream in = bzSrc.getStream()) {
fa = AnalyzerGuru.getAnalyzer(in, newname);
}
if (!(fa instanceof BZip2Analyzer)) {
if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) {
this.g = Genre.XREFABLE;
} else {
this.g = Genre.DATA;
}
fa.analyze(doc, bzSrc, xrefOut);
if (doc.get(QueryBuilder.T) != null) {
doc.removeField(QueryBuilder.T);
if (g == Genre.XREFABLE) {
doc.add(new Field(QueryBuilder.T, g.typeName(),
AnalyzerGuru.string_ft_stored_nanalyzed_norms));
}
}
if (fa == null) {
this.g = Genre.DATA;
LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname);
//TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
} else if (!(fa instanceof BZip2Analyzer)) {
analyzeUncompressed(doc, xrefOut, fa, bzSrc);
}
}
}
Expand All @@ -126,6 +112,11 @@ public InputStream getStream() throws IOException {
throw new IOException("Not BZIP2 format");
}
}

@Override
public String getSourceIdentifier() {
return src.getSourceIdentifier();
}
};
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2017-2020, Chris Fraire <cfraire@me.com>.
*/

package org.opengrok.indexer.analysis.archive;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.FileAnalyzer;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;

import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* Represents a base for compressed formats (e.g. gzip or bzip2) but not for
* archive formats that have compression (e.g. Zip or Jar).
* @author Chandan
*/
public abstract class CompressedAnalyzer extends FileAnalyzer {

    private static final Logger LOGGER = LoggerFactory.getLogger(CompressedAnalyzer.class);

    /** Buffer size used when skipping or reading the decompressed stream. */
    private static final int CHUNK_SIZE = 8 * 1024;

    /**
     * Genre determined from the uncompressed content, or {@code null} if
     * {@link #analyzeUncompressed} has not (yet) set it.
     */
    protected Genre g;

    /**
     * Gets the genre determined from the uncompressed content if it has been
     * set, or else falls back to the superclass (factory-defined) genre.
     * @return a defined instance
     */
    @Override
    public Genre getGenre() {
        if (g != null) {
            return g;
        }
        return super.getGenre();
    }

    /**
     * Initializes an instance for the specified factory.
     * @param factory a defined instance
     */
    protected CompressedAnalyzer(AnalyzerFactory factory) {
        super(factory);
    }

    /**
     * Analyzes the uncompressed content using the specified analyzer, after
     * possibly substituting the huge-text analyzer if plain-text content
     * meets the configured huge-text threshold. Sets {@link #g} from the
     * delegate analyzer's genre, and rewrites the document's type field to
     * match.
     * @param doc the document to which fields are written
     * @param xrefOut a defined instance or {@code null} if no xref is desired
     * @param fa the analyzer detected for the uncompressed content
     * @param compressedSrc a stream source which yields uncompressed content
     * @throws IOException if an I/O error occurs while analyzing
     * @throws InterruptedException if the analysis is interrupted
     */
    protected void analyzeUncompressed(
            Document doc, Writer xrefOut, AbstractAnalyzer fa, StreamSource compressedSrc)
            throws IOException, InterruptedException {

        if (fa.getGenre() == Genre.PLAIN) {
            if (meetsHugeTextThreshold(compressedSrc)) {
                String origFileTypeName = fa.getFileTypeName();
                fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer();
                g = Genre.DATA;
                if (LOGGER.isLoggable(Level.WARNING)) {
                    LOGGER.log(Level.WARNING, "{0} is compressed huge text: {1}",
                            new Object[]{origFileTypeName, compressedSrc.getSourceIdentifier()});
                }
            } else {
                g = Genre.XREFABLE;
            }
        } else if (fa.getGenre() == Genre.XREFABLE) {
            g = Genre.XREFABLE;
        } else {
            g = Genre.DATA;
        }

        fa.analyze(doc, compressedSrc, xrefOut);
        // Replace any existing type field so it reflects the genre above.
        if (doc.get(QueryBuilder.T) != null) {
            doc.removeField(QueryBuilder.T);
        }
        doc.add(new Field(QueryBuilder.T, g.typeName(),
                AnalyzerGuru.string_ft_stored_nanalyzed_norms));
    }

    /**
     * Determines whether the uncompressed content is at least as large as the
     * configured huge-text threshold, by skipping (or, failing that, reading)
     * the decompressed stream and counting bytes.
     */
    private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOException {
        RuntimeEnvironment env = RuntimeEnvironment.getInstance();
        int hugeTextThresholdBytes = env.getHugeTextThresholdBytes();
        if (Integer.MAX_VALUE == hugeTextThresholdBytes) {
            // Don't bother decompressing to count if the limit is MAX_VALUE.
            return false;
        }

        try (InputStream in = compressedSrc.getStream()) {
            // Try skip first, as it may be cheaper than reading.
            SkipResult result = meetsHugeTextThresholdBySkip(in, hugeTextThresholdBytes);
            if (result.didMeet) {
                return true;
            }

            // Even if some skipped, only read==-1 is a true indicator of EOF.
            long bytesRead = result.bytesSkipped;
            byte[] buf = new byte[CHUNK_SIZE];
            int n; // InputStream.read() returns an int byte count or -1.
            while ((n = in.read(buf, 0, buf.length)) != -1) {
                bytesRead += n;
                if (bytesRead >= hugeTextThresholdBytes) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Tries to count bytes toward the threshold using {@link InputStream#skip},
     * which some streams implement without materializing data. A skip() return
     * of zero or an {@code IOException} merely ends the attempt — the caller
     * falls back to reading from the current position.
     */
    private SkipResult meetsHugeTextThresholdBySkip(InputStream in, int hugeTextThresholdBytes) {
        long bytesSkipped = 0;
        long n;
        try {
            while ((n = in.skip(CHUNK_SIZE)) > 0) {
                bytesSkipped += n;
                if (bytesSkipped >= hugeTextThresholdBytes) {
                    return new SkipResult(bytesSkipped, true);
                }
            }
        } catch (IOException ignored) {
            // Ignore and assume not capable of skip.
        }
        return new SkipResult(bytesSkipped, false);
    }

    /** Immutable result of a skip attempt: bytes skipped and threshold verdict. */
    private static class SkipResult {
        final long bytesSkipped;
        final boolean didMeet;

        SkipResult(long bytesSkipped, boolean didMeet) {
            this.bytesSkipped = bytesSkipped;
            this.didMeet = didMeet;
        }
    }
}