Skip to content
This repository was archived by the owner on Jun 20, 2023. It is now read-only.

Commit 6b91e53

Browse files
committed
Add test documents from tika test suite
This patch adds a zip of about 200 files from Tika's test suite and asserts that some content comes back from each one. It is a good exercise of the various formats. Huge files were removed to keep the size reasonable, while keeping enough variety to know that things are working. Running this test exposed issues with the parser config, which are fixed here.
1 parent 997fac1 commit 6b91e53

File tree

3 files changed

+66
-9
lines changed

3 files changed

+66
-9
lines changed

src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,6 @@ final class TikaImpl {
2929
new org.apache.tika.parser.odf.OpenDocumentParser(),
3030
new org.apache.tika.parser.iwork.IWorkPackageParser(),
3131
new org.apache.tika.parser.xml.DcXMLParser(),
32-
// images:
33-
new org.apache.tika.parser.image.BPGParser(),
34-
new org.apache.tika.parser.image.ImageParser(),
35-
new org.apache.tika.parser.image.TiffParser(),
36-
new org.apache.tika.parser.image.WebPParser(),
37-
new org.apache.tika.parser.jpeg.JpegParser(),
38-
// compression / packaging:
39-
new org.apache.tika.parser.pkg.CompressorParser(),
40-
new org.apache.tika.parser.pkg.PackageParser(),
4132
};
4233

4334
/** autodetector based on this subset */
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package org.elasticsearch.mapper.attachments;
2+
3+
/*
4+
* Licensed to Elasticsearch under one or more contributor
5+
* license agreements. See the NOTICE file distributed with
6+
* this work for additional information regarding copyright
7+
* ownership. Elasticsearch licenses this file to you under
8+
* the Apache License, Version 2.0 (the "License"); you may
9+
* not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing,
15+
* software distributed under the License is distributed on an
16+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17+
* KIND, either express or implied. See the License for the
18+
* specific language governing permissions and limitations
19+
* under the License.
20+
*/
21+
22+
import java.nio.file.DirectoryStream;
23+
import java.nio.file.Files;
24+
import java.nio.file.Path;
25+
26+
import org.apache.lucene.util.LuceneTestCase.SuppressFileSystems;
27+
import org.apache.lucene.util.TestUtil;
28+
import org.apache.tika.metadata.Metadata;
29+
30+
import org.elasticsearch.test.ESTestCase;
31+
32+
/**
33+
* Evil test-coverage cheat, we parse a bunch of docs from tika
34+
* so that we have a nice grab-bag variety, and assert some content
35+
* comes back and no exception.
36+
*/
37+
@SuppressFileSystems("ExtrasFS") // don't try to parse extraN
38+
public class TikaDocTests extends ESTestCase {
39+
40+
/** some test files from tika test suite, zipped up */
41+
static final String TIKA_FILES = "/org/elasticsearch/index/mapper/attachment/test/tika-files.zip";
42+
43+
public void testFiles() throws Exception {
44+
Path tmp = createTempDir();
45+
TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES), tmp);
46+
47+
try (DirectoryStream<Path> stream = Files.newDirectoryStream(tmp)) {
48+
for (Path doc : stream) {
49+
logger.debug("parsing: {}", doc);
50+
assertParseable(doc);
51+
}
52+
}
53+
}
54+
55+
void assertParseable(Path fileName) throws Exception {
56+
try {
57+
byte bytes[] = Files.readAllBytes(fileName);
58+
String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
59+
assertNotNull(parsedContent);
60+
assertFalse(parsedContent.isEmpty());
61+
logger.debug("extracted content: {}", parsedContent);
62+
} catch (Throwable e) {
63+
throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e);
64+
}
65+
}
66+
}
Binary file not shown.

0 commit comments

Comments
 (0)