本文整理了Java中org.apache.tika.parser.Parser类的一些代码示例,展示了Parser类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Parser类的具体详情如下:
包路径:org.apache.tika.parser.Parser
类名称:Parser
[英]Tika parser interface.
[中]Tika解析器接口。
代码示例来源:origin: apache/tika
/**
 * Shows the minimal call sequence for parsing through an {@code AutoDetectParser}:
 * an empty in-memory stream is passed to the parser together with a no-op SAX
 * handler, a fresh metadata holder and a fresh parse context.
 *
 * @throws Exception if the parse call fails
 */
public static void useAutoDetectParser() throws Exception {
    // Empty payload: the example only illustrates the wiring, not real content.
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    // DefaultHandler ignores every SAX event it receives.
    ContentHandler discardingHandler = new DefaultHandler();
    Metadata collectedMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    Parser autoDetecting = new AutoDetectParser();
    autoDetecting.parse(emptyInput, discardingHandler, collectedMetadata, parseContext);
}
代码示例来源:origin: apache/tika
/**
 * Reports whether the {@code parser} field lists, among its supported types, the media
 * type that the {@code detector} field assigns to the given stream.
 *
 * @param input stream handed to the detector
 * @return {@code true} if the detected type is contained in the parser's supported types
 * @throws IOException if one of the underlying calls raises it
 */
public boolean isSupported(TikaInputStream input) throws IOException {
    // Detection runs against a throwaway Metadata instance; nothing is kept from it.
    final Metadata scratchMetadata = new Metadata();
    final MediaType detected = detector.detect(input, scratchMetadata);

    final ParseContext emptyContext = new ParseContext();
    return parser.getSupportedTypes(emptyContext).contains(detected);
}
代码示例来源:origin: javasoze/meaningfulweb
/**
 * Runs {@code parser} over {@code in}, discarding the content events, and then copies
 * every metadata entry collected in {@code meta} into {@code ogmeta}.
 *
 * @param parser parser used to read the stream
 * @param in     stream to parse
 * @param meta   metadata object filled by the parser
 * @param ogmeta destination map; receives one entry per metadata name, using the value
 *               returned by {@code meta.get(name)}
 * @throws IOException   propagated from the parse call
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
private static void parseMeta(Parser parser, InputStream in, Metadata meta, Map<String, String> ogmeta)
        throws IOException, SAXException, TikaException {
    // Content is not needed here, so SAX events go to a no-op handler.
    parser.parse(in, new DefaultHandler(), meta, new ParseContext());

    for (String key : meta.names()) {
        ogmeta.put(key, meta.get(key));
    }
}
代码示例来源:origin: jpotts/alfresco-api-java-examples
InputStream stream = new FileInputStream(file);
try {
Metadata metadata = new Metadata();
ContentHandler handler = new DefaultHandler();
Parser parser = new JpegParser();
ParseContext context = new ParseContext();
metadata.set(Metadata.CONTENT_TYPE, mimeType);
parser.parse(stream, handler, metadata, context);
String lat = metadata.get("geo:lat");
String lon = metadata.get("geo:long");
stream.close();
代码示例来源:origin: apache/cxf
return null;
final Metadata metadata = new Metadata();
metadata.set(HttpHeaders.CONTENT_TYPE, mediaType.toString());
} else {
for (Parser p : parsers) {
if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
continue;
context = new ParseContext();
if (context.get(Parser.class) == null) {
context.set(Parser.class,
parser instanceof AutoDetectParser ? parser : new AutoDetectParser());
parser.parse(in, handler, metadata, context);
} catch (Exception ex) {
parser.parse(in, handler, metadata, context);
} else {
throw ex;
代码示例来源:origin: apache/tika
ParseContext context = new ParseContext();
BodyContentHandler handler;
Metadata metadata;
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");
assertEquals(1, usedParsers.length);
assertEquals(DummyParser.class.getName(), usedParsers[0]);
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
代码示例来源:origin: apache/tika
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
if (content instanceof OpenDocumentContentParser) {
} else {
content.parse(zip, handler, metadata, context);
} else {
content.parse(zip, handler, metadata, context);
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
代码示例来源:origin: apache/tika
xhtml.startDocument();
ContentHandler childHandler = new EmbeddedContentHandler(
new BodyContentHandler(xhtml));
type = type.trim();
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals("metadata.xml")) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith(".opf")) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith(".htm") ||
entry.getName().endsWith(".html") ||
entry.getName().endsWith(".xhtml")) {
content.parse(zip, childHandler, metadata, context);
代码示例来源:origin: gentics/mesh
/**
 * Builds a {@code Single} that parses the given stream with an {@code AutoDetectParser}
 * and emits the collected metadata as a name-to-value map.
 *
 * <p>Any exception raised while parsing or while emitting the result is forwarded to the
 * subscriber through {@code onError}. The stream is not closed by this method.
 *
 * @param ins stream to parse
 * @return a {@code Single} emitting one map entry per metadata name
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
    return Single.create(emitter -> {
        Parser tikaParser = new AutoDetectParser();
        BodyContentHandler bodyHandler = new BodyContentHandler();
        Metadata extracted = new Metadata();
        ParseContext parseContext = new ParseContext();
        try {
            tikaParser.parse(ins, bodyHandler, extracted, parseContext);

            Map<String, String> result = new HashMap<>();
            for (String key : extracted.names()) {
                result.put(key, extracted.get(key));
            }
            emitter.onSuccess(result);
        } catch (Exception e) {
            emitter.onError(e);
        }
        // Note: ins is left open; closing it is not done here.
    });
}
代码示例来源:origin: org.apache.beam/beam-sdks-java-io-tika
/**
 * Parses one readable file with Tika and outputs a {@code ParseResult} describing the
 * outcome: a success result carrying the extracted text, or a failure result carrying the
 * text extracted so far together with the exception.
 *
 * <p>Each element is parsed into its own {@code Metadata} instance. The metadata
 * configured on {@code spec} is only used as a template and is copied, never mutated, so
 * values extracted from one file cannot show up in the result of another and results
 * that were already output are not modified afterwards.
 *
 * @param c process context supplying the file and receiving the result
 * @throws Exception if opening or closing the file's stream fails; parse failures are
 *                   reported through the output instead
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile file = c.element();
    // Both streams are managed by try-with-resources so the channel stream is released
    // even if wrapping it in a TikaInputStream fails.
    try (InputStream stream = Channels.newInputStream(file.open());
            InputStream tikaStream = TikaInputStream.get(stream)) {
        Parser parser =
                tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);
        ParseContext context = new ParseContext();
        // Registering the parser in the context lets it be looked up during the parse.
        context.set(Parser.class, parser);

        // Copy the configured input metadata instead of reusing the shared instance:
        // the parser writes into this object and it is emitted inside the ParseResult.
        Metadata tikaMetadata = new Metadata();
        Metadata inputMetadata = spec.getInputMetadata();
        if (inputMetadata != null) {
            for (String name : inputMetadata.names()) {
                for (String value : inputMetadata.getValues(name)) {
                    tikaMetadata.add(name, value);
                }
            }
        }
        if (spec.getContentTypeHint() != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }

        String location = file.getMetadata().resourceId().toString();
        ParseResult res;
        ContentHandler tikaHandler = new ToTextContentHandler();
        try {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
            res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
        } catch (Exception e) {
            // Keep whatever text and metadata were collected before the failure.
            res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
        }
        c.output(res);
    }
}
}
代码示例来源:origin: apache/tika
MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
ParseContext context = new ParseContext();
BodyContentHandler handler;
Metadata metadata;
Set<MediaType> types = p.getSupportedTypes(context);
assertEquals(2, types.size());
assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN));
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("", handler.toString());
代码示例来源:origin: apache/tika
/**
 * Parses one page, supplied as raw bytes, with the given HTML parser and forwards the body
 * content to {@code xhtml} through an embedded-content wrapper.
 *
 * <p>A {@code SAXException} is rethrown wrapped in a {@code RuntimeException}; an
 * {@code IOException} is ignored (see the comment in the catch block).
 *
 * @param byteObject bytes of the page
 * @param htmlParser parser applied to the bytes
 * @param xhtml      handler receiving the page's body content
 * @param context    parse context passed through to the parser
 * @throws TikaException propagated from the parse call
 */
private void parsePage(byte[] byteObject, Parser htmlParser,
                       ContentHandler xhtml, ParseContext context) throws TikaException {
    Metadata pageMetadata = new Metadata();
    ContentHandler embeddedBody = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
    InputStream pageBytes = new ByteArrayInputStream(byteObject);
    try {
        htmlParser.parse(pageBytes, embeddedBody, pageMetadata, context);
    } catch (SAXException e) {
        throw new RuntimeException(e);
    } catch (IOException ignored) {
        // Pushback overflow from tagsoup
    }
}
代码示例来源:origin: apache/tika
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
throws IOException, SAXException, TikaException {
TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig);
_TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext);
代码示例来源:origin: NGDATA/lilyproject
BodyContentHandler ch = new BodyContentHandler(woh);
Metadata metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, blob.getMediaType());
if (blob.getName() != null) {
metadata.add(Metadata.RESOURCE_NAME_KEY, blob.getName());
ParseContext parseContext = new ParseContext();
tikaParser.parse(is, ch, metadata, parseContext);
} catch (Throwable t) {
if (woh.isWriteLimitReached(t)) {
String text = ch.toString();
if (text.length() > 0) {
result.add(text);
代码示例来源:origin: apache/tika
String v = toString(obj, c.getType());
if (isRichText(c)) {
BodyContentHandler h = new BodyContentHandler();
Metadata m = new Metadata();
m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
try {
htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
h,
m, parseContext);
handler.characters(h.toString());
} catch (SAXException e) {
代码示例来源:origin: apache/tika
/**
 * Loads the Spring application context from {@code org/apache/tika/example/spring.xml},
 * fetches the bean named {@code "tika"} as a {@code Parser}, and parses a fixed greeting
 * string with it, writing the content events to standard output.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if loading the context or parsing fails
 */
public static void main(String[] args) throws Exception {
    String[] configLocations = {"org/apache/tika/example/spring.xml"};
    ApplicationContext context = new ClassPathXmlApplicationContext(configLocations);
    Parser parser = context.getBean("tika", Parser.class);

    ByteArrayInputStream greeting = new ByteArrayInputStream("Hello, World!".getBytes(UTF_8));
    WriteOutContentHandler stdoutHandler = new WriteOutContentHandler(System.out);
    parser.parse(greeting, stdoutHandler, new Metadata(), new ParseContext());
}
}
代码示例来源:origin: apache/tika
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
System.out.println("The MIME type (based on filename) is: ["
+ mimeRegistry.detect(null, metadata) + "]");
metadata.set(Metadata.CONTENT_TYPE, type.toString());
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, new ParseContext());
代码示例来源:origin: ViDA-NYU/ache
/**
 * Extracts text and metadata from the given stream using the {@code parser} field.
 *
 * @param stream      content to parse
 * @param fileName    passed to {@code createMetadata}
 * @param contentType passed to {@code createMetadata}
 * @return the extracted text together with a name-to-value metadata map, or {@code null}
 *         if parsing fails with an {@code IOException}, {@code SAXException} or
 *         {@code TikaException} (the failure is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    // Text accumulates in bodyText; the Boilerpipe wrapper is what the parser writes to.
    BodyContentHandler bodyText = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler boilerpipe =
            new BoilerpipeContentHandler(bodyText, KeepEverythingExtractor.INSTANCE);
    Metadata tikaMetadata = createMetadata(fileName, contentType);
    ParseContext parseContext = new ParseContext();

    try {
        parser.parse(stream, boilerpipe, tikaMetadata, parseContext);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }

    Map<String, String> properties = new HashMap<>();
    for (String key : tikaMetadata.names()) {
        properties.put(key, tikaMetadata.get(key));
    }
    return new ParsedData(bodyText.toString(), properties);
}
代码示例来源:origin: com.github.lafa.tikaNoExternal/tika-parsers
/**
 * Dispatches a single zip entry according to its name:
 * <ul>
 *   <li>{@code mimetype}: the entry's text is stored as the content type in
 *       {@code metadata};</li>
 *   <li>{@code META_NAME}: the entry is parsed by the {@code meta} parser with a no-op
 *       handler;</li>
 *   <li>names ending in {@code content.xml} or {@code styles.xml}: the entry is parsed by
 *       the {@code content} parser, via {@code parseInternal} when that parser is an
 *       {@code OpenDocumentContentParser};</li>
 *   <li>anything else, or a {@code null} entry: ignored.</li>
 * </ul>
 *
 * @param entry    zip entry to handle; may be {@code null}
 * @param zip      stream positioned at the entry's data
 * @param metadata metadata object to update
 * @param context  parse context passed through to the parsers
 * @param handler  handler receiving content events
 * @throws IOException   propagated from reading or parsing the entry
 * @throws SAXException  propagated from the parsers
 * @throws TikaException propagated from the parsers
 */
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
                            ParseContext context, EndDocumentShieldingContentHandler handler)
        throws IOException, SAXException, TikaException {
    if (entry == null) {
        return;
    }

    String entryName = entry.getName();

    if (entryName.equals("mimetype")) {
        metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(zip, UTF_8));
        return;
    }

    if (entryName.equals(META_NAME)) {
        meta.parse(zip, new DefaultHandler(), metadata, context);
        return;
    }

    if (entryName.endsWith("content.xml") || entryName.endsWith("styles.xml")) {
        if (content instanceof OpenDocumentContentParser) {
            ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
        } else {
            // Foreign content parser was set:
            content.parse(zip, handler, metadata, context);
        }
    }
}
}
代码示例来源:origin: apache/tika
context.set(Parser.class, decorator);
ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler();
long started = System.currentTimeMillis();
parserState.recursiveParserWrapperHandler.startDocument();
try {
getWrappedParser().parse(stream, localHandler, metadata, context);
} catch (SAXException e) {
boolean wlr = isWriteLimitReached(e);
throw e;
metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true");
} catch (Throwable e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime", stackTrace);
throw e;
} finally {
long elapsedMillis = System.currentTimeMillis() - started;
metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
parserState.recursiveParserWrapperHandler.endDocument();
内容来源于网络,如有侵权,请联系作者删除!