mirror of
https://github.com/jlengrand/quarkus.git
synced 2026-03-10 08:41:22 +00:00
Support for the parser configuration in the properties
This commit is contained in:
committed by
Guillaume Smet
parent
2364bb77a3
commit
71a36330db
@@ -0,0 +1,21 @@
|
||||
package io.quarkus.tika.deployment;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import io.quarkus.builder.item.SimpleBuildItem;
|
||||
import io.quarkus.tika.runtime.TikaParserParameter;
|
||||
|
||||
public final class TikaParsersConfigBuildItem extends SimpleBuildItem {
|
||||
|
||||
private final Map<String, List<TikaParserParameter>> parsersConfig;
|
||||
|
||||
public TikaParsersConfigBuildItem(Map<String, List<TikaParserParameter>> parsersConfig) {
|
||||
this.parsersConfig = parsersConfig;
|
||||
}
|
||||
|
||||
public Map<String, List<TikaParserParameter>> getConfiguration() {
|
||||
return parsersConfig;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -2,18 +2,19 @@ package io.quarkus.tika.deployment;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.tika.detect.Detector;
|
||||
import org.apache.tika.detect.EncodingDetector;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.eclipse.microprofile.config.ConfigProvider;
|
||||
|
||||
import io.quarkus.arc.deployment.BeanContainerBuildItem;
|
||||
import io.quarkus.deployment.Capabilities;
|
||||
@@ -28,7 +29,9 @@ import io.quarkus.deployment.builditem.nativeimage.NativeImageResourceBuildItem;
|
||||
import io.quarkus.deployment.builditem.nativeimage.RuntimeInitializedClassBuildItem;
|
||||
import io.quarkus.deployment.builditem.nativeimage.ServiceProviderBuildItem;
|
||||
import io.quarkus.deployment.util.ServiceUtil;
|
||||
import io.quarkus.tika.TikaParseException;
|
||||
import io.quarkus.tika.runtime.TikaConfiguration;
|
||||
import io.quarkus.tika.runtime.TikaParserParameter;
|
||||
import io.quarkus.tika.runtime.TikaRecorder;
|
||||
|
||||
public class TikaProcessor {
|
||||
@@ -51,8 +54,12 @@ public class TikaProcessor {
|
||||
|
||||
@BuildStep
|
||||
@Record(ExecutionTime.STATIC_INIT)
|
||||
void initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder) throws Exception {
|
||||
recorder.initTikaParser(beanContainer.getValue(), config, getSupportedParserNames(config.parsers));
|
||||
TikaParsersConfigBuildItem initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder)
|
||||
throws Exception {
|
||||
Map<String, List<TikaParserParameter>> parsersConfig = getSupportedParserConfig(config.tikaConfigPath, config.parsers,
|
||||
config.parserOptions, config.parser);
|
||||
recorder.initTikaParser(beanContainer.getValue(), config, parsersConfig);
|
||||
return new TikaParsersConfigBuildItem(parsersConfig);
|
||||
}
|
||||
|
||||
@BuildStep
|
||||
@@ -95,9 +102,11 @@ public class TikaProcessor {
|
||||
}
|
||||
|
||||
@BuildStep
|
||||
public void registerTikaProviders(BuildProducer<ServiceProviderBuildItem> serviceProvider) throws Exception {
|
||||
public void registerTikaProviders(BuildProducer<ServiceProviderBuildItem> serviceProvider,
|
||||
TikaParsersConfigBuildItem parserConfigItem) throws Exception {
|
||||
serviceProvider.produce(
|
||||
new ServiceProviderBuildItem(Parser.class.getName(), getSupportedParserNames(config.parsers)));
|
||||
new ServiceProviderBuildItem(Parser.class.getName(),
|
||||
new ArrayList<>(parserConfigItem.getConfiguration().keySet())));
|
||||
serviceProvider.produce(
|
||||
new ServiceProviderBuildItem(Detector.class.getName(), getProviderNames(Detector.class.getName())));
|
||||
serviceProvider.produce(
|
||||
@@ -110,31 +119,95 @@ public class TikaProcessor {
|
||||
"META-INF/services/" + serviceProviderName));
|
||||
}
|
||||
|
||||
static List<String> getSupportedParserNames(Optional<String> requiredParsers) throws Exception {
|
||||
static Map<String, List<TikaParserParameter>> getSupportedParserConfig(Optional<String> tikaConfigPath,
|
||||
Optional<String> requiredParsers,
|
||||
Map<String, Map<String, String>> parserParamMaps,
|
||||
Map<String, String> parserAbbreviations) throws Exception {
|
||||
Predicate<String> pred = p -> !NOT_NATIVE_READY_PARSERS.contains(p);
|
||||
List<String> providerNames = getProviderNames(Parser.class.getName());
|
||||
if (!requiredParsers.isPresent()) {
|
||||
return providerNames.stream().filter(pred).collect(Collectors.toList());
|
||||
if (tikaConfigPath.isPresent() || !requiredParsers.isPresent()) {
|
||||
return providerNames.stream().filter(pred).collect(Collectors.toMap(Function.identity(),
|
||||
p -> Collections.<TikaParserParameter> emptyList()));
|
||||
} else {
|
||||
List<String> abbreviations = Arrays.stream(requiredParsers.get().split(",")).map(s -> s.trim())
|
||||
.collect(Collectors.toList());
|
||||
Set<String> requiredParsersFullNames = abbreviations.stream()
|
||||
.map(p -> getParserNameFromConfig(p)).collect(Collectors.toSet());
|
||||
Map<String, String> fullNamesAndAbbreviations = abbreviations.stream()
|
||||
.collect(Collectors.toMap(p -> getParserNameFromConfig(p, parserAbbreviations), Function.identity()));
|
||||
|
||||
return providerNames.stream().filter(pred).filter(p -> requiredParsersFullNames.contains(p))
|
||||
.collect(Collectors.toList());
|
||||
return providerNames.stream().filter(pred).filter(p -> fullNamesAndAbbreviations.containsKey(p))
|
||||
.collect(Collectors.toMap(Function.identity(),
|
||||
p -> getParserConfig(p, parserParamMaps.get(fullNamesAndAbbreviations.get(p)))));
|
||||
}
|
||||
}
|
||||
|
||||
private static String getParserNameFromConfig(String abbreviation) {
|
||||
static List<TikaParserParameter> getParserConfig(String parserName, Map<String, String> parserParamMap) {
|
||||
List<TikaParserParameter> parserParams = new LinkedList<>();
|
||||
if (parserParamMap != null) {
|
||||
for (Map.Entry<String, String> entry : parserParamMap.entrySet()) {
|
||||
String paramName = unhyphenate(entry.getKey());
|
||||
String paramType = getParserParamType(parserName, paramName);
|
||||
parserParams.add(new TikaParserParameter(paramName, entry.getValue(), paramType));
|
||||
}
|
||||
}
|
||||
return parserParams;
|
||||
}
|
||||
|
||||
private static String getParserNameFromConfig(String abbreviation, Map<String, String> parserAbbreviations) {
|
||||
if (PARSER_ABBREVIATIONS.containsKey(abbreviation)) {
|
||||
return PARSER_ABBREVIATIONS.get(abbreviation);
|
||||
}
|
||||
|
||||
if (parserAbbreviations.containsKey(abbreviation)) {
|
||||
return parserAbbreviations.get(abbreviation);
|
||||
}
|
||||
|
||||
throw new IllegalStateException("The custom abbreviation `" + abbreviation
|
||||
+ "` can not be resolved to a parser class name, please set a "
|
||||
+ "quarkus.tika.parser-name." + abbreviation + " property");
|
||||
}
|
||||
|
||||
// Convert a property name such as "sort-by-position" to "sortByPosition"
|
||||
private static String unhyphenate(String paramName) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String[] words = paramName.split("-");
|
||||
for (int i = 0; i < words.length; i++) {
|
||||
sb.append(i > 0 ? capitalize(words[i]) : words[i]);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String capitalize(String paramName) {
|
||||
char[] chars = paramName.toCharArray();
|
||||
chars[0] = Character.toUpperCase(chars[0]);
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
// TODO: Remove the reflection code below once TikaConfig becomes capable
|
||||
// of loading the parameters without the type attribute: TIKA-2944
|
||||
|
||||
private static Class<?> loadParserClass(String parserName) {
|
||||
try {
|
||||
return ConfigProvider.getConfig().getValue(abbreviation, String.class);
|
||||
} catch (NoSuchElementException ex) {
|
||||
throw new IllegalStateException("The custom abbreviation " + abbreviation
|
||||
+ " can not be resolved to a parser class name");
|
||||
return TikaProcessor.class.getClassLoader().loadClass(parserName);
|
||||
} catch (Throwable t) {
|
||||
final String errorMessage = "Parser " + parserName + " can not be loaded";
|
||||
throw new TikaParseException(errorMessage);
|
||||
}
|
||||
}
|
||||
|
||||
private static String getParserParamType(String parserName, String paramName) {
|
||||
try {
|
||||
Class<?> parserClass = loadParserClass(parserName);
|
||||
String paramType = parserClass.getMethod("get" + capitalize(paramName), new Class[] {}).getReturnType()
|
||||
.getSimpleName().toLowerCase();
|
||||
if (paramType.equals(boolean.class.getSimpleName())) {
|
||||
// TikaConfig Param class does not recognize 'boolean', only 'bool'
|
||||
// This whole reflection code is temporary anyway
|
||||
paramType = "bool";
|
||||
}
|
||||
return paramType;
|
||||
} catch (Throwable t) {
|
||||
final String errorMessage = "Parser " + parserName + " has no " + paramName + " property";
|
||||
throw new TikaParseException(errorMessage);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
package io.quarkus.tika.deployment;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
import org.eclipse.microprofile.config.spi.ConfigSource;
|
||||
|
||||
public class TestConfigSource implements ConfigSource {
|
||||
|
||||
@Override
|
||||
public Map<String, String> getProperties() {
|
||||
return Collections.singletonMap("opendoc", "org.apache.tika.parser.odf.OpenDocumentParser");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getValue(String propertyName) {
|
||||
return getProperties().get(propertyName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "test-source";
|
||||
}
|
||||
}
|
||||
@@ -4,8 +4,11 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.eclipse.microprofile.config.Config;
|
||||
import org.eclipse.microprofile.config.spi.ConfigProviderResolver;
|
||||
@@ -14,6 +17,7 @@ import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import io.quarkus.runtime.configuration.QuarkusConfigFactory;
|
||||
import io.quarkus.tika.runtime.TikaParserParameter;
|
||||
import io.smallrye.config.SmallRyeConfig;
|
||||
import io.smallrye.config.SmallRyeConfigBuilder;
|
||||
|
||||
@@ -45,27 +49,55 @@ public class TikaProcessorTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSupportedParserNames() throws Exception {
|
||||
Optional<String> parserNames = Optional.of("pdf");
|
||||
List<String> names = TikaProcessor.getSupportedParserNames(parserNames);
|
||||
public void testPDFParserName() throws Exception {
|
||||
Set<String> names = getParserNames(null, "pdf");
|
||||
assertEquals(1, names.size());
|
||||
assertEquals("org.apache.tika.parser.pdf.PDFParser", names.get(0));
|
||||
assertTrue(names.contains("org.apache.tika.parser.pdf.PDFParser"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResolvableCustomAbbreviation() throws Exception {
|
||||
Optional<String> parserNames = Optional.of("pdf,opendoc");
|
||||
List<String> names = TikaProcessor.getSupportedParserNames(parserNames);
|
||||
public void testODFParserName() throws Exception {
|
||||
Set<String> names = getParserNames(null, "odf");
|
||||
assertEquals(1, names.size());
|
||||
assertTrue(names.contains("org.apache.tika.parser.odf.OpenDocumentParser"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSupportedParserNames() throws Exception {
|
||||
Set<String> names = getParserNames(null, "pdf,odf");
|
||||
assertEquals(2, names.size());
|
||||
assertTrue(names.contains("org.apache.tika.parser.pdf.PDFParser"));
|
||||
assertTrue(names.contains("org.apache.tika.parser.odf.OpenDocumentParser"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResolvableCustomAbbreviation() throws Exception {
|
||||
Set<String> names = getParserConfig(null, "pdf,opendoc", Collections.emptyMap(),
|
||||
Collections.singletonMap("opendoc",
|
||||
"org.apache.tika.parser.odf.OpenDocumentParser")).keySet();
|
||||
assertEquals(2, names.size());
|
||||
assertTrue(names.contains("org.apache.tika.parser.pdf.PDFParser"));
|
||||
assertTrue(names.contains("org.apache.tika.parser.odf.OpenDocumentParser"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPdfParserConfig() throws Exception {
|
||||
Map<String, List<TikaParserParameter>> parserConfig = getParserConfig(null, "pdf",
|
||||
Collections.singletonMap("pdf",
|
||||
Collections.singletonMap("sort-by-position", "true")),
|
||||
Collections.emptyMap());
|
||||
assertEquals(1, parserConfig.size());
|
||||
|
||||
String pdfParserFullName = "org.apache.tika.parser.pdf.PDFParser";
|
||||
assertEquals(1, parserConfig.get(pdfParserFullName).size());
|
||||
assertEquals("sortByPosition", parserConfig.get(pdfParserFullName).get(0).getName());
|
||||
assertEquals("true", parserConfig.get(pdfParserFullName).get(0).getValue());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnresolvableCustomAbbreviation() throws Exception {
|
||||
Optional<String> parserNames = Optional.of("classparser");
|
||||
try {
|
||||
TikaProcessor.getSupportedParserNames(parserNames);
|
||||
getParserNames(null, "classparser");
|
||||
fail("'classparser' is not resolvable");
|
||||
} catch (IllegalStateException ex) {
|
||||
// expected
|
||||
@@ -74,8 +106,26 @@ public class TikaProcessorTest {
|
||||
|
||||
@Test
|
||||
public void testAllSupportedParserNames() throws Exception {
|
||||
Optional<String> parserNames = Optional.ofNullable(null);
|
||||
List<String> names = TikaProcessor.getSupportedParserNames(parserNames);
|
||||
assertEquals(69, getParserNames(null, null).size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSupportedParserNamesWithTikaConfigPath() throws Exception {
|
||||
Set<String> names = getParserNames("tika-config.xml", "pdf");
|
||||
assertEquals(69, names.size());
|
||||
}
|
||||
|
||||
private Set<String> getParserNames(String tikaConfigPath, String parsers) throws Exception {
|
||||
return TikaProcessor.getSupportedParserConfig(
|
||||
Optional.ofNullable(tikaConfigPath), Optional.ofNullable(parsers),
|
||||
Collections.emptyMap(), Collections.emptyMap()).keySet();
|
||||
}
|
||||
|
||||
private Map<String, List<TikaParserParameter>> getParserConfig(String tikaConfigPath, String parsers,
|
||||
Map<String, Map<String, String>> parserParamMaps,
|
||||
Map<String, String> parserAbbreviations) throws Exception {
|
||||
return TikaProcessor.getSupportedParserConfig(
|
||||
Optional.ofNullable(tikaConfigPath), Optional.ofNullable(parsers),
|
||||
parserParamMaps, parserAbbreviations);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
io.quarkus.tika.deployment.TestConfigSource
|
||||
@@ -1,5 +1,6 @@
|
||||
package io.quarkus.tika.runtime;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import io.quarkus.runtime.annotations.ConfigItem;
|
||||
@@ -26,16 +27,15 @@ public class TikaConfiguration {
|
||||
* property is recommended to achieve both optimizations.
|
||||
* <p>
|
||||
* Either the abbreviated or full parser class names can be used.
|
||||
* At the moment only PDF parser can be listed using a reserved 'pdf' abbreviation.
|
||||
* Only PDF and OpenDocument format parsers can be listed using the reserved 'pdf' and 'odf' abbreviations.
|
||||
* Custom class name abbreviations have to be used for all other parsers.
|
||||
* For example:
|
||||
*
|
||||
* <pre>
|
||||
* // Only PDF parser is required:
|
||||
* tika-parsers = pdf
|
||||
* // Only PDF and Java class parsers are required:
|
||||
* tika-parsers = pdf,classparser
|
||||
* classparser = org.apache.tika.parser.asm.ClassParser
|
||||
* quarkus.tika.parsers = pdf
|
||||
* // Only PDF and OpenDocument parsers are required:
|
||||
* quarkus.tika.parsers = pdf,odf
|
||||
* </pre>
|
||||
*
|
||||
* This property will have no effect if the `tikaConfigPath` property has been set.
|
||||
@@ -43,6 +43,28 @@ public class TikaConfiguration {
|
||||
@ConfigItem
|
||||
public Optional<String> parsers;
|
||||
|
||||
/**
|
||||
* Configuration of the individual parsers.
|
||||
* For example:
|
||||
*
|
||||
* <pre>
|
||||
* quarkus.tika.parsers = pdf,odf
|
||||
* quarkus.tika.parser-options.pdf.sort-by-position = true
* </pre>
|
||||
*/
|
||||
@ConfigItem
|
||||
public Map<String, Map<String, String>> parserOptions;
|
||||
|
||||
/**
|
||||
* Full parser class name for a given parser abbreviation.
|
||||
* For example:
|
||||
*
|
||||
* <pre>
|
||||
* quarkus.tika.parsers = classparser
|
||||
* quarkus.tika.parser.classparser = org.apache.tika.parser.asm.ClassParser
* </pre>
|
||||
*/
|
||||
@ConfigItem
|
||||
public Map<String, String> parser;
|
||||
|
||||
/**
|
||||
* Controls how the content of the embedded documents is parsed.
|
||||
* By default it is appended to the master document content.
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
package io.quarkus.tika.runtime;
|
||||
|
||||
/**
 * Mutable bean describing one parser parameter: its name, its value and its type.
 */
public class TikaParserParameter {

    private String name;
    private String type;
    private String value;

    /** Creates a parameter whose name, value and type are all {@code null}. */
    public TikaParserParameter() {
        this(null, null, null);
    }

    /**
     * @param name the parameter name
     * @param value the parameter value
     * @param type the parameter type
     */
    public TikaParserParameter(String name, String value, String type) {
        this.name = name;
        this.value = value;
        this.type = type;
    }

    /** @return the parameter name */
    public String getName() {
        return this.name;
    }

    /** @param name the new parameter name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the parameter value */
    public String getValue() {
        return this.value;
    }

    /** @param value the new parameter value */
    public void setValue(String value) {
        this.value = value;
    }

    /** @return the parameter type */
    public String getType() {
        return this.type;
    }

    /** @param type the new parameter type */
    public void setType(String type) {
        this.type = type;
    }
}
|
||||
@@ -4,6 +4,8 @@ import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
@@ -18,16 +20,17 @@ import io.quarkus.tika.TikaParser;
|
||||
@Recorder
|
||||
public class TikaRecorder {
|
||||
|
||||
public void initTikaParser(BeanContainer container, TikaConfiguration config, List<String> supportedParserNames) {
|
||||
TikaParser parser = initializeParser(config, supportedParserNames);
|
||||
public void initTikaParser(BeanContainer container, TikaConfiguration config,
|
||||
Map<String, List<TikaParserParameter>> parserConfig) {
|
||||
TikaParser parser = initializeParser(config, parserConfig);
|
||||
TikaParserProducer producer = container.instance(TikaParserProducer.class);
|
||||
producer.initialize(parser);
|
||||
}
|
||||
|
||||
private TikaParser initializeParser(TikaConfiguration config, List<String> supportedParserNames) {
|
||||
private TikaParser initializeParser(TikaConfiguration config, Map<String, List<TikaParserParameter>> parserConfig) {
|
||||
TikaConfig tikaConfig = null;
|
||||
|
||||
try (InputStream stream = getTikaConfigStream(config, supportedParserNames)) {
|
||||
try (InputStream stream = getTikaConfigStream(config, parserConfig)) {
|
||||
tikaConfig = new TikaConfig(stream);
|
||||
} catch (Exception ex) {
|
||||
final String errorMessage = "Invalid tika-config.xml";
|
||||
@@ -44,7 +47,8 @@ public class TikaRecorder {
|
||||
return new TikaParser(nativeParser, config.appendEmbeddedContent);
|
||||
}
|
||||
|
||||
private static InputStream getTikaConfigStream(TikaConfiguration config, List<String> supportedParserNames) {
|
||||
private static InputStream getTikaConfigStream(TikaConfiguration config,
|
||||
Map<String, List<TikaParserParameter>> parserConfig) {
|
||||
// Load tika-config.xml resource
|
||||
InputStream is = null;
|
||||
if (config.tikaConfigPath.isPresent()) {
|
||||
@@ -56,20 +60,35 @@ public class TikaRecorder {
|
||||
throw new TikaParseException(errorMessage);
|
||||
}
|
||||
} else {
|
||||
is = generateTikaConfig(supportedParserNames);
|
||||
is = generateTikaConfig(parserConfig);
|
||||
}
|
||||
return is;
|
||||
}
|
||||
|
||||
private static InputStream generateTikaConfig(List<String> supportedParserNames) {
|
||||
private static InputStream generateTikaConfig(Map<String, List<TikaParserParameter>> parserConfig) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("<properties>");
|
||||
sb.append("<parsers>");
|
||||
for (String parserName : supportedParserNames) {
|
||||
sb.append("<parser class=\"").append(parserName).append("\"/>");
|
||||
for (Entry<String, List<TikaParserParameter>> parserEntry : parserConfig.entrySet()) {
|
||||
sb.append("<parser class=\"").append(parserEntry.getKey()).append("\">");
|
||||
if (!parserEntry.getValue().isEmpty()) {
|
||||
appendParserParameters(sb, parserEntry.getValue());
|
||||
}
|
||||
sb.append("</parser>");
|
||||
}
|
||||
sb.append("</parsers>");
|
||||
sb.append("</properties>");
|
||||
return new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
private static void appendParserParameters(StringBuilder sb, List<TikaParserParameter> parserParams) {
|
||||
sb.append("<params>");
|
||||
for (TikaParserParameter parserParam : parserParams) {
|
||||
sb.append("<param name=\"").append(parserParam.getName());
|
||||
sb.append("\" type=\"").append(parserParam.getType()).append("\">");
|
||||
sb.append(parserParam.getValue());
|
||||
sb.append("</param>");
|
||||
}
|
||||
sb.append("</params>");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,8 @@ import javax.ws.rs.core.MediaType;
|
||||
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.RecursiveParserWrapper;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
|
||||
import io.quarkus.tika.TikaContent;
|
||||
import io.quarkus.tika.TikaParser;
|
||||
@@ -18,7 +20,8 @@ import io.quarkus.tika.TikaParser;
|
||||
public class TikaEmdeddedContentResource {
|
||||
|
||||
// Avoiding the injection, otherwise the recorded tika-config.xml intended for TikaPdfInvoiceTest is used
|
||||
TikaParser parser = new TikaParser(new RecursiveParserWrapper(new AutoDetectParser(), true), false);
|
||||
TikaParser parser = new TikaParser(new RecursiveParserWrapper(
|
||||
new AutoDetectParser(new OfficeParser(), new PDFParser()), true), false);
|
||||
|
||||
@POST
|
||||
@Path("/outerText")
|
||||
|
||||
@@ -9,13 +9,17 @@ import javax.ws.rs.Produces;
|
||||
import javax.ws.rs.core.MediaType;
|
||||
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.csv.TextAndCSVParser;
|
||||
import org.apache.tika.parser.odf.OpenDocumentParser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
|
||||
import io.quarkus.tika.TikaParser;
|
||||
|
||||
@Path("/parse")
|
||||
public class TikaParserResource {
|
||||
// Avoiding the injection, otherwise the recorded tika-config.xml intended for TikaPdfInvoiceTest is used
|
||||
TikaParser parser = new TikaParser(new AutoDetectParser(), true);
|
||||
TikaParser parser = new TikaParser(
|
||||
new AutoDetectParser(new PDFParser(), new OpenDocumentParser(), new TextAndCSVParser()), true);
|
||||
|
||||
@POST
|
||||
@Path("/text")
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
quarkus.tika.tika-config-path=tika-config.xml
|
||||
quarkus.tika.parsers=pdf
|
||||
quarkus.tika.parser-options.pdf.sort-by-position=true
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
<properties>
|
||||
<parsers>
|
||||
<parser class="org.apache.tika.parser.pdf.PDFParser">
|
||||
<params>
|
||||
<param name="sortByPosition" type="bool">true</param>
|
||||
</params>
|
||||
</parser>
|
||||
</parsers>
|
||||
</properties>
|
||||
Reference in New Issue
Block a user