Support for the parser configuration in the properties

This commit is contained in:
Sergey Beryozkin
2019-09-25 12:10:26 +01:00
committed by Guillaume Smet
parent 2364bb77a3
commit 71a36330db
12 changed files with 280 additions and 80 deletions

View File

@@ -0,0 +1,21 @@
package io.quarkus.tika.deployment;
import java.util.List;
import java.util.Map;
import io.quarkus.builder.item.SimpleBuildItem;
import io.quarkus.tika.runtime.TikaParserParameter;
/**
 * Build item that carries the Tika parser configuration from the build step
 * which computes it to the build steps which consume it.
 * <p>
 * Each key of the map has a list of {@link TikaParserParameter}s associated with it.
 */
public final class TikaParsersConfigBuildItem extends SimpleBuildItem {

    /** Parser configuration exactly as handed to the constructor (no copy is made). */
    private final Map<String, List<TikaParserParameter>> configuration;

    /**
     * Creates the build item.
     *
     * @param parsersConfig the parser configuration to expose; the reference is stored as-is
     */
    public TikaParsersConfigBuildItem(Map<String, List<TikaParserParameter>> parsersConfig) {
        super();
        configuration = parsersConfig;
    }

    /**
     * Returns the parser configuration.
     *
     * @return the same map instance that was passed to the constructor
     */
    public Map<String, List<TikaParserParameter>> getConfiguration() {
        return configuration;
    }
}

View File

@@ -2,18 +2,19 @@ package io.quarkus.tika.deployment;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.parser.Parser;
import org.eclipse.microprofile.config.ConfigProvider;
import io.quarkus.arc.deployment.BeanContainerBuildItem;
import io.quarkus.deployment.Capabilities;
@@ -28,7 +29,9 @@ import io.quarkus.deployment.builditem.nativeimage.NativeImageResourceBuildItem;
import io.quarkus.deployment.builditem.nativeimage.RuntimeInitializedClassBuildItem;
import io.quarkus.deployment.builditem.nativeimage.ServiceProviderBuildItem;
import io.quarkus.deployment.util.ServiceUtil;
import io.quarkus.tika.TikaParseException;
import io.quarkus.tika.runtime.TikaConfiguration;
import io.quarkus.tika.runtime.TikaParserParameter;
import io.quarkus.tika.runtime.TikaRecorder;
public class TikaProcessor {
@@ -51,8 +54,12 @@ public class TikaProcessor {
@BuildStep
@Record(ExecutionTime.STATIC_INIT)
void initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder) throws Exception {
recorder.initTikaParser(beanContainer.getValue(), config, getSupportedParserNames(config.parsers));
TikaParsersConfigBuildItem initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder)
throws Exception {
Map<String, List<TikaParserParameter>> parsersConfig = getSupportedParserConfig(config.tikaConfigPath, config.parsers,
config.parserOptions, config.parser);
recorder.initTikaParser(beanContainer.getValue(), config, parsersConfig);
return new TikaParsersConfigBuildItem(parsersConfig);
}
@BuildStep
@@ -95,9 +102,11 @@ public class TikaProcessor {
}
@BuildStep
public void registerTikaProviders(BuildProducer<ServiceProviderBuildItem> serviceProvider) throws Exception {
public void registerTikaProviders(BuildProducer<ServiceProviderBuildItem> serviceProvider,
TikaParsersConfigBuildItem parserConfigItem) throws Exception {
serviceProvider.produce(
new ServiceProviderBuildItem(Parser.class.getName(), getSupportedParserNames(config.parsers)));
new ServiceProviderBuildItem(Parser.class.getName(),
new ArrayList<>(parserConfigItem.getConfiguration().keySet())));
serviceProvider.produce(
new ServiceProviderBuildItem(Detector.class.getName(), getProviderNames(Detector.class.getName())));
serviceProvider.produce(
@@ -110,31 +119,95 @@ public class TikaProcessor {
"META-INF/services/" + serviceProviderName));
}
static List<String> getSupportedParserNames(Optional<String> requiredParsers) throws Exception {
static Map<String, List<TikaParserParameter>> getSupportedParserConfig(Optional<String> tikaConfigPath,
Optional<String> requiredParsers,
Map<String, Map<String, String>> parserParamMaps,
Map<String, String> parserAbbreviations) throws Exception {
Predicate<String> pred = p -> !NOT_NATIVE_READY_PARSERS.contains(p);
List<String> providerNames = getProviderNames(Parser.class.getName());
if (!requiredParsers.isPresent()) {
return providerNames.stream().filter(pred).collect(Collectors.toList());
if (tikaConfigPath.isPresent() || !requiredParsers.isPresent()) {
return providerNames.stream().filter(pred).collect(Collectors.toMap(Function.identity(),
p -> Collections.<TikaParserParameter> emptyList()));
} else {
List<String> abbreviations = Arrays.stream(requiredParsers.get().split(",")).map(s -> s.trim())
.collect(Collectors.toList());
Set<String> requiredParsersFullNames = abbreviations.stream()
.map(p -> getParserNameFromConfig(p)).collect(Collectors.toSet());
Map<String, String> fullNamesAndAbbreviations = abbreviations.stream()
.collect(Collectors.toMap(p -> getParserNameFromConfig(p, parserAbbreviations), Function.identity()));
return providerNames.stream().filter(pred).filter(p -> requiredParsersFullNames.contains(p))
.collect(Collectors.toList());
return providerNames.stream().filter(pred).filter(p -> fullNamesAndAbbreviations.containsKey(p))
.collect(Collectors.toMap(Function.identity(),
p -> getParserConfig(p, parserParamMaps.get(fullNamesAndAbbreviations.get(p)))));
}
}
private static String getParserNameFromConfig(String abbreviation) {
/**
 * Converts the configured options of a single parser into a list of
 * {@link TikaParserParameter}s.
 * <p>
 * Every option name is converted from its hyphenated form to camel case and the
 * parameter type is looked up on the parser class.
 *
 * @param parserName the parser class name the options belong to
 * @param parserParamMap the option names and values; may be {@code null} when the
 *        parser has no options configured
 * @return a new mutable list with one parameter per option, empty when
 *         {@code parserParamMap} is {@code null} or empty
 */
static List<TikaParserParameter> getParserConfig(String parserName, Map<String, String> parserParamMap) {
    if (parserParamMap == null) {
        return new ArrayList<>();
    }
    // The list is only appended to and later read by index, so an array-backed list fits better
    List<TikaParserParameter> parserParams = new ArrayList<>(parserParamMap.size());
    for (Map.Entry<String, String> entry : parserParamMap.entrySet()) {
        String paramName = unhyphenate(entry.getKey());
        String paramType = getParserParamType(parserName, paramName);
        parserParams.add(new TikaParserParameter(paramName, entry.getValue(), paramType));
    }
    return parserParams;
}
/**
 * Resolves a parser abbreviation to a parser class name.
 * <p>
 * The reserved abbreviations in {@code PARSER_ABBREVIATIONS} are checked first,
 * then the custom abbreviations supplied by the configuration.
 *
 * @param abbreviation the abbreviation listed in the parsers property
 * @param parserAbbreviations the custom abbreviation to class name mappings
 * @return the parser class name registered for the abbreviation
 * @throws IllegalStateException if the abbreviation is neither reserved nor custom
 */
private static String getParserNameFromConfig(String abbreviation, Map<String, String> parserAbbreviations) {
    if (PARSER_ABBREVIATIONS.containsKey(abbreviation)) {
        return PARSER_ABBREVIATIONS.get(abbreviation);
    }
    if (parserAbbreviations.containsKey(abbreviation)) {
        return parserAbbreviations.get(abbreviation);
    }
    // The hint has to name the property that actually feeds parserAbbreviations,
    // i.e. the `parser` map of the Tika configuration.
    throw new IllegalStateException("The custom abbreviation `" + abbreviation
            + "` can not be resolved to a parser class name, please set a "
            + "quarkus.tika.parser." + abbreviation + " property");
}
/**
 * Converts a hyphenated property name such as "sort-by-position" to its
 * camel case form, "sortByPosition".
 * <p>
 * The first word is kept unchanged, every following word is capitalized.
 *
 * @param paramName the hyphenated property name
 * @return the camel case name
 */
private static String unhyphenate(String paramName) {
    StringBuilder camelCase = new StringBuilder();
    boolean firstWord = true;
    for (String word : paramName.split("-")) {
        if (firstWord) {
            camelCase.append(word);
            firstWord = false;
        } else {
            camelCase.append(capitalize(word));
        }
    }
    return camelCase.toString();
}
/**
 * Returns the given text with its first character converted to upper case.
 *
 * @param paramName the text to capitalize
 * @return the capitalized text; an empty string is returned unchanged
 */
private static String capitalize(String paramName) {
    // An empty word (e.g. produced by two consecutive hyphens) has no first character
    if (paramName.isEmpty()) {
        return paramName;
    }
    char[] chars = paramName.toCharArray();
    chars[0] = Character.toUpperCase(chars[0]);
    return new String(chars);
}
// TODO: Remove the reflection code below once TikaConfig becomes capable
// of loading the parameters without the type attribute: TIKA-2944
private static Class<?> loadParserClass(String parserName) {
try {
return ConfigProvider.getConfig().getValue(abbreviation, String.class);
} catch (NoSuchElementException ex) {
throw new IllegalStateException("The custom abbreviation " + abbreviation
+ " can not be resolved to a parser class name");
return TikaProcessor.class.getClassLoader().loadClass(parserName);
} catch (Throwable t) {
final String errorMessage = "Parser " + parserName + " can not be loaded";
throw new TikaParseException(errorMessage);
}
}
/**
 * Determines the type name of a parser parameter by looking up the matching
 * no-argument {@code get<ParamName>} method on the parser class.
 * <p>
 * The type name is the lower-cased simple name of the getter's return type,
 * with {@code boolean} reported as {@code bool}.
 *
 * @param parserName the parser class name
 * @param paramName the camel case parameter name
 * @return the parameter type name
 * @throws TikaParseException if the parser class can not be loaded or it has no
 *         getter for the parameter
 */
private static String getParserParamType(String parserName, String paramName) {
    // Loaded outside of the try block: a class loading failure must keep its own
    // error message instead of being reported as a missing property.
    Class<?> parserClass = loadParserClass(parserName);
    try {
        // Locale.ROOT keeps the type name stable whatever the default locale of the build is
        String paramType = parserClass.getMethod("get" + capitalize(paramName)).getReturnType()
                .getSimpleName().toLowerCase(java.util.Locale.ROOT);
        if (paramType.equals(boolean.class.getSimpleName())) {
            // TikaConfig Param class does not recognize 'boolean', only 'bool'
            // This whole reflection code is temporary anyway
            paramType = "bool";
        }
        return paramType;
    } catch (Throwable t) {
        final String errorMessage = "Parser " + parserName + " has no " + paramName + " property";
        throw new TikaParseException(errorMessage);
    }
}
}

View File

@@ -1,24 +0,0 @@
package io.quarkus.tika.deployment;
import java.util.Collections;
import java.util.Map;
import org.eclipse.microprofile.config.spi.ConfigSource;
public class TestConfigSource implements ConfigSource {
@Override
public Map<String, String> getProperties() {
return Collections.singletonMap("opendoc", "org.apache.tika.parser.odf.OpenDocumentParser");
}
@Override
public String getValue(String propertyName) {
return getProperties().get(propertyName);
}
@Override
public String getName() {
return "test-source";
}
}

View File

@@ -4,8 +4,11 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.eclipse.microprofile.config.Config;
import org.eclipse.microprofile.config.spi.ConfigProviderResolver;
@@ -14,6 +17,7 @@ import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import io.quarkus.runtime.configuration.QuarkusConfigFactory;
import io.quarkus.tika.runtime.TikaParserParameter;
import io.smallrye.config.SmallRyeConfig;
import io.smallrye.config.SmallRyeConfigBuilder;
@@ -45,27 +49,55 @@ public class TikaProcessorTest {
}
@Test
public void testSupportedParserNames() throws Exception {
Optional<String> parserNames = Optional.of("pdf");
List<String> names = TikaProcessor.getSupportedParserNames(parserNames);
public void testPDFParserName() throws Exception {
Set<String> names = getParserNames(null, "pdf");
assertEquals(1, names.size());
assertEquals("org.apache.tika.parser.pdf.PDFParser", names.get(0));
assertTrue(names.contains("org.apache.tika.parser.pdf.PDFParser"));
}
@Test
public void testResolvableCustomAbbreviation() throws Exception {
Optional<String> parserNames = Optional.of("pdf,opendoc");
List<String> names = TikaProcessor.getSupportedParserNames(parserNames);
public void testODFParserName() throws Exception {
Set<String> names = getParserNames(null, "odf");
assertEquals(1, names.size());
assertTrue(names.contains("org.apache.tika.parser.odf.OpenDocumentParser"));
}
@Test
public void testSupportedParserNames() throws Exception {
Set<String> names = getParserNames(null, "pdf,odf");
assertEquals(2, names.size());
assertTrue(names.contains("org.apache.tika.parser.pdf.PDFParser"));
assertTrue(names.contains("org.apache.tika.parser.odf.OpenDocumentParser"));
}
@Test
public void testResolvableCustomAbbreviation() throws Exception {
Set<String> names = getParserConfig(null, "pdf,opendoc", Collections.emptyMap(),
Collections.singletonMap("opendoc",
"org.apache.tika.parser.odf.OpenDocumentParser")).keySet();
assertEquals(2, names.size());
assertTrue(names.contains("org.apache.tika.parser.pdf.PDFParser"));
assertTrue(names.contains("org.apache.tika.parser.odf.OpenDocumentParser"));
}
@Test
// Checks that a hyphenated PDF parser option is exposed as a single camel case parameter
public void testPdfParserConfig() throws Exception {
    final String pdfParserFullName = "org.apache.tika.parser.pdf.PDFParser";
    Map<String, String> pdfOptions = Collections.singletonMap("sort-by-position", "true");

    Map<String, List<TikaParserParameter>> parserConfig = getParserConfig(null, "pdf",
            Collections.singletonMap("pdf", pdfOptions), Collections.emptyMap());
    assertEquals(1, parserConfig.size());

    List<TikaParserParameter> pdfParams = parserConfig.get(pdfParserFullName);
    assertEquals(1, pdfParams.size());

    TikaParserParameter sortByPosition = pdfParams.get(0);
    assertEquals("sortByPosition", sortByPosition.getName());
    assertEquals("true", sortByPosition.getValue());
}
@Test
public void testUnresolvableCustomAbbreviation() throws Exception {
Optional<String> parserNames = Optional.of("classparser");
try {
TikaProcessor.getSupportedParserNames(parserNames);
getParserNames(null, "classparser");
fail("'classparser' is not resolvable");
} catch (IllegalStateException ex) {
// expected
@@ -74,8 +106,26 @@ public class TikaProcessorTest {
@Test
public void testAllSupportedParserNames() throws Exception {
Optional<String> parserNames = Optional.ofNullable(null);
List<String> names = TikaProcessor.getSupportedParserNames(parserNames);
assertEquals(69, getParserNames(null, null).size());
}
@Test
public void testSupportedParserNamesWithTikaConfigPath() throws Exception {
Set<String> names = getParserNames("tika-config.xml", "pdf");
assertEquals(69, names.size());
}
// Returns the keys of the supported parser configuration when neither parser
// options nor custom abbreviations are configured.
private Set<String> getParserNames(String tikaConfigPath, String parsers) throws Exception {
    Map<String, List<TikaParserParameter>> parserConfig = getParserConfig(tikaConfigPath, parsers,
            Collections.emptyMap(), Collections.emptyMap());
    return parserConfig.keySet();
}
// Calls TikaProcessor.getSupportedParserConfig, wrapping the nullable config path
// and parser list into Optionals.
private Map<String, List<TikaParserParameter>> getParserConfig(String tikaConfigPath, String parsers,
        Map<String, Map<String, String>> parserParamMaps,
        Map<String, String> parserAbbreviations) throws Exception {
    Optional<String> configPath = Optional.ofNullable(tikaConfigPath);
    Optional<String> requiredParsers = Optional.ofNullable(parsers);
    return TikaProcessor.getSupportedParserConfig(configPath, requiredParsers, parserParamMaps,
            parserAbbreviations);
}
}

View File

@@ -1,5 +1,6 @@
package io.quarkus.tika.runtime;
import java.util.Map;
import java.util.Optional;
import io.quarkus.runtime.annotations.ConfigItem;
@@ -26,16 +27,15 @@ public class TikaConfiguration {
* property is recommended to achieve both optimizations.
* <p>
* Either the abbreviated or full parser class names can be used.
* At the moment only PDF parser can be listed using a reserved 'pdf' abbreviation.
* Only PDF and OpenDocument format parsers can be listed using the reserved 'pdf' and 'odf' abbreviations.
* Custom class name abbreviations have to be used for all other parsers.
* For example:
*
* <pre>
* // Only PDF parser is required:
* tika-parsers = pdf
* // Only PDF and Java class parsers are required:
* tika-parsers = pdf,classparser
* classparser = org.apache.tika.parser.asm.ClassParser
* quarkus.tika.parsers = pdf
* // Only PDF and OpenDocument parsers are required:
* quarkus.tika.parsers = pdf,odf
* </pre>
*
* This property will have no effect if the `quarkus.tika.tika-config-path` property has been set.
@@ -43,6 +43,28 @@ public class TikaConfiguration {
@ConfigItem
public Optional<String> parsers;
/**
* Configuration of the individual parsers.
* For example:
*
* <pre>
* quarkus.tika.parsers = pdf,odf
* quarkus.tika.parser-options.pdf.sort-by-position = true
* </pre>
*/
@ConfigItem
public Map<String, Map<String, String>> parserOptions;
/**
* Full parser class name for a given parser abbreviation.
* For example:
*
* <pre>
* quarkus.tika.parsers = classparser
* quarkus.tika.parser.classparser = org.apache.tika.parser.asm.ClassParser
* </pre>
*/
@ConfigItem
public Map<String, String> parser;
/**
* Controls how the content of the embedded documents is parsed.
* By default it is appended to the master document content.

View File

@@ -0,0 +1,41 @@
package io.quarkus.tika.runtime;
/**
 * A single parser parameter: its name, its value and the name of its type.
 * <p>
 * All three properties are plain strings and can be read and replaced at any time.
 * NOTE(review): the no-argument constructor and the setters are presumably needed
 * so that the instance can be recreated by the recorder - confirm before removing.
 */
public class TikaParserParameter {

    private String name;
    private String type;
    private String value;

    /** Creates a parameter with all properties unset ({@code null}). */
    public TikaParserParameter() {
    }

    /**
     * Creates a fully initialized parameter.
     *
     * @param name the parameter name
     * @param value the parameter value
     * @param type the parameter type name
     */
    public TikaParserParameter(String name, String value, String type) {
        setName(name);
        setValue(value);
        setType(type);
    }

    /** @return the parameter name */
    public final String getName() {
        return this.name;
    }

    /** @param name the new parameter name */
    public final void setName(String name) {
        this.name = name;
    }

    /** @return the parameter value */
    public final String getValue() {
        return this.value;
    }

    /** @param value the new parameter value */
    public final void setValue(String value) {
        this.value = value;
    }

    /** @return the parameter type name */
    public final String getType() {
        return this.type;
    }

    /** @param type the new parameter type name */
    public final void setType(String type) {
        this.type = type;
    }
}

View File

@@ -4,6 +4,8 @@ import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.AutoDetectParser;
@@ -18,16 +20,17 @@ import io.quarkus.tika.TikaParser;
@Recorder
public class TikaRecorder {
public void initTikaParser(BeanContainer container, TikaConfiguration config, List<String> supportedParserNames) {
TikaParser parser = initializeParser(config, supportedParserNames);
public void initTikaParser(BeanContainer container, TikaConfiguration config,
Map<String, List<TikaParserParameter>> parserConfig) {
TikaParser parser = initializeParser(config, parserConfig);
TikaParserProducer producer = container.instance(TikaParserProducer.class);
producer.initialize(parser);
}
private TikaParser initializeParser(TikaConfiguration config, List<String> supportedParserNames) {
private TikaParser initializeParser(TikaConfiguration config, Map<String, List<TikaParserParameter>> parserConfig) {
TikaConfig tikaConfig = null;
try (InputStream stream = getTikaConfigStream(config, supportedParserNames)) {
try (InputStream stream = getTikaConfigStream(config, parserConfig)) {
tikaConfig = new TikaConfig(stream);
} catch (Exception ex) {
final String errorMessage = "Invalid tika-config.xml";
@@ -44,7 +47,8 @@ public class TikaRecorder {
return new TikaParser(nativeParser, config.appendEmbeddedContent);
}
private static InputStream getTikaConfigStream(TikaConfiguration config, List<String> supportedParserNames) {
private static InputStream getTikaConfigStream(TikaConfiguration config,
Map<String, List<TikaParserParameter>> parserConfig) {
// Load tika-config.xml resource
InputStream is = null;
if (config.tikaConfigPath.isPresent()) {
@@ -56,20 +60,35 @@ public class TikaRecorder {
throw new TikaParseException(errorMessage);
}
} else {
is = generateTikaConfig(supportedParserNames);
is = generateTikaConfig(parserConfig);
}
return is;
}
private static InputStream generateTikaConfig(List<String> supportedParserNames) {
private static InputStream generateTikaConfig(Map<String, List<TikaParserParameter>> parserConfig) {
StringBuilder sb = new StringBuilder();
sb.append("<properties>");
sb.append("<parsers>");
for (String parserName : supportedParserNames) {
sb.append("<parser class=\"").append(parserName).append("\"/>");
for (Entry<String, List<TikaParserParameter>> parserEntry : parserConfig.entrySet()) {
sb.append("<parser class=\"").append(parserEntry.getKey()).append("\">");
if (!parserEntry.getValue().isEmpty()) {
appendParserParameters(sb, parserEntry.getValue());
}
sb.append("</parser>");
}
sb.append("</parsers>");
sb.append("</properties>");
return new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8));
}
/**
 * Appends a {@code <params>} element with one {@code <param>} child per parser
 * parameter to the generated Tika configuration.
 * <p>
 * The parameter name, type and value come from the application configuration,
 * so XML special characters in them are escaped to keep the document well-formed.
 *
 * @param sb the builder holding the configuration generated so far
 * @param parserParams the parameters of a single parser
 */
private static void appendParserParameters(StringBuilder sb, List<TikaParserParameter> parserParams) {
    sb.append("<params>");
    for (TikaParserParameter parserParam : parserParams) {
        sb.append("<param name=\"").append(escapeXmlText(parserParam.getName()));
        sb.append("\" type=\"").append(escapeXmlText(parserParam.getType())).append("\">");
        sb.append(escapeXmlText(parserParam.getValue()));
        sb.append("</param>");
    }
    sb.append("</params>");
}

/**
 * Replaces the five XML special characters with their predefined entities.
 *
 * @param text the text to escape, may be {@code null}
 * @return the escaped text, or {@code null} if {@code text} is {@code null}
 */
private static String escapeXmlText(String text) {
    if (text == null) {
        // StringBuilder.append renders a null String as "null", as it did before escaping was added
        return null;
    }
    StringBuilder escaped = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            case '\'':
                escaped.append("&apos;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}
}

View File

@@ -10,6 +10,8 @@ import javax.ws.rs.core.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.pdf.PDFParser;
import io.quarkus.tika.TikaContent;
import io.quarkus.tika.TikaParser;
@@ -18,7 +20,8 @@ import io.quarkus.tika.TikaParser;
public class TikaEmdeddedContentResource {
// Avoiding the injection, otherwise the recorded tika-config.xml intended for TikaPdfInvoiceTest is used
TikaParser parser = new TikaParser(new RecursiveParserWrapper(new AutoDetectParser(), true), false);
TikaParser parser = new TikaParser(new RecursiveParserWrapper(
new AutoDetectParser(new OfficeParser(), new PDFParser()), true), false);
@POST
@Path("/outerText")

View File

@@ -9,13 +9,17 @@ import javax.ws.rs.Produces;
import javax.ws.rs.core.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.csv.TextAndCSVParser;
import org.apache.tika.parser.odf.OpenDocumentParser;
import org.apache.tika.parser.pdf.PDFParser;
import io.quarkus.tika.TikaParser;
@Path("/parse")
public class TikaParserResource {
// Avoiding the injection, otherwise the recorded tika-config.xml intended for TikaPdfInvoiceTest is used
TikaParser parser = new TikaParser(new AutoDetectParser(), true);
TikaParser parser = new TikaParser(
new AutoDetectParser(new PDFParser(), new OpenDocumentParser(), new TextAndCSVParser()), true);
@POST
@Path("/text")

View File

@@ -1 +1,2 @@
quarkus.tika.tika-config-path=tika-config.xml
quarkus.tika.parsers=pdf
quarkus.tika.parser-options.pdf.sort-by-position=true

View File

@@ -1,9 +0,0 @@
<properties>
<parsers>
<parser class="org.apache.tika.parser.pdf.PDFParser">
<params>
<param name="sortByPosition" type="bool">true</param>
</params>
</parser>
</parsers>
</properties>