org.apache.tika.parser.Parser类的使用及代码示例

x33g5p2x  于2022-01-26 转载在 其他  
字(11.4k)|赞(0)|评价(0)|浏览(372)

本文整理了Java中org.apache.tika.parser.Parser类的一些代码示例,展示了Parser类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Parser类的具体详情如下:
包路径:org.apache.tika.parser.Parser
类名称:Parser

Parser介绍

[英]Tika parser interface.
[中]Tika解析器接口。

代码示例

代码示例来源:origin: apache/tika

/**
 * Minimal end-to-end example of the {@code Parser} API: an {@code AutoDetectParser} is run
 * over an empty in-memory stream with a no-op SAX handler, a fresh {@code Metadata} and a
 * fresh {@code ParseContext}.
 *
 * @throws Exception whatever the parser raises while parsing
 */
public static void useAutoDetectParser() throws Exception {
  // Inputs for the parse call: zero bytes of content and a handler that discards all events.
  InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  ContentHandler discardingHandler = new DefaultHandler();
  Metadata collectedMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();

  Parser autoDetect = new AutoDetectParser();
  autoDetect.parse(emptyInput, discardingHandler, collectedMetadata, parseContext);
}

代码示例来源:origin: apache/tika

/**
 * Tells whether the media type detected for {@code input} is among the types the
 * {@code parser} field reports as supported.
 *
 * @param input stream whose media type is detected via the {@code detector} field
 * @return {@code true} if the detected type is contained in the parser's supported types
 * @throws IOException if detection fails while reading the stream
 */
public boolean isSupported(TikaInputStream input) throws IOException {
  // Detection runs with empty metadata, so no name or content-type hint is supplied.
  Metadata noHints = new Metadata();
  MediaType detected = detector.detect(input, noHints);
  ParseContext emptyContext = new ParseContext();
  return parser.getSupportedTypes(emptyContext).contains(detected);
}

代码示例来源:origin: javasoze/meaningfulweb

/**
 * Parses {@code in} with the given parser, discarding the content events, and copies every
 * metadata entry found in {@code meta} afterwards into {@code ogmeta}.
 *
 * @param parser parser used to read the stream
 * @param in     stream to parse
 * @param meta   metadata object handed to the parser and read back afterwards
 * @param ogmeta destination map; receives one entry per metadata name (via {@code meta.get})
 */
private static void parseMeta(Parser parser, InputStream in, Metadata meta, Map<String, String> ogmeta)
  throws IOException, SAXException, TikaException {
 // Only the metadata side effect is of interest, so the SAX events go to a no-op handler.
 parser.parse(in, new DefaultHandler(), meta, new ParseContext());
 for (String key : meta.names()) {
  ogmeta.put(key, meta.get(key));
 }
}

代码示例来源:origin: jpotts/alfresco-api-java-examples

InputStream stream = new FileInputStream(file);
try {
  Metadata metadata = new Metadata();
  ContentHandler handler = new DefaultHandler();
  Parser parser = new JpegParser();
  ParseContext context = new ParseContext();
  metadata.set(Metadata.CONTENT_TYPE, mimeType);
  parser.parse(stream, handler, metadata, context);
  String lat = metadata.get("geo:lat");
  String lon = metadata.get("geo:long");
  stream.close();

代码示例来源:origin: apache/cxf

return null;
final Metadata metadata = new Metadata();
    metadata.set(HttpHeaders.CONTENT_TYPE, mediaType.toString());
  } else {
    for (Parser p : parsers) {
      if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
        continue;
    context = new ParseContext();
  if (context.get(Parser.class) == null) {
    context.set(Parser.class,
          parser instanceof AutoDetectParser ? parser : new AutoDetectParser());
    parser.parse(in, handler, metadata, context);
  } catch (Exception ex) {
      parser.parse(in, handler, metadata, context);
    } else {
      throw ex;

代码示例来源:origin: apache/tika

ParseContext context = new ParseContext();
BodyContentHandler handler;
Metadata metadata;
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");
assertEquals(1, usedParsers.length);
assertEquals(DummyParser.class.getName(), usedParsers[0]);
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());

代码示例来源:origin: apache/tika

metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
  meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
  if (content instanceof OpenDocumentContentParser) {
  } else {
    content.parse(zip, handler, metadata, context);
  } else {
    content.parse(zip, handler, metadata, context);
    EmbeddedDocumentExtractor embeddedDocumentExtractor =
        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());

代码示例来源:origin: apache/tika

xhtml.startDocument();
ContentHandler childHandler = new EmbeddedContentHandler(
   new BodyContentHandler(xhtml));
      type = type.trim();
    metadata.set(Metadata.CONTENT_TYPE, type);
  } else if (entry.getName().equals("metadata.xml")) {
    meta.parse(zip, new DefaultHandler(), metadata, context);
  } else if (entry.getName().endsWith(".opf")) {
    meta.parse(zip, new DefaultHandler(), metadata, context);
  } else if (entry.getName().endsWith(".htm") || 
          entry.getName().endsWith(".html") || 
        entry.getName().endsWith(".xhtml")) {
    content.parse(zip, childHandler, metadata, context);

代码示例来源:origin: gentics/mesh

/**
 * Builds a {@code Single} whose {@code Single.create} callback parses {@code ins} with an
 * {@code AutoDetectParser} and emits the resulting metadata as a name-to-value map.
 * Any exception raised while parsing or emitting is forwarded through {@code onError}.
 *
 * @param ins stream to extract metadata from; this method does not close it
 * @return a single emitting the metadata map, or an error
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
  return Single.create(emitter -> {
    Parser tika = new AutoDetectParser();
    BodyContentHandler bodyHandler = new BodyContentHandler();
    Metadata extracted = new Metadata();
    ParseContext parseContext = new ParseContext();
    try {
      tika.parse(ins, bodyHandler, extracted, parseContext);
      Map<String, String> result = new HashMap<>();
      for (String key : extracted.names()) {
        result.put(key, extracted.get(key));
      }
      emitter.onSuccess(result);
    } catch (Exception e) {
      emitter.onError(e);
    }
    // NOTE(review): the stream is left open here (a close call was disabled in the
    // original); presumably the caller owns it - confirm.
  });
}

代码示例来源:origin: org.apache.beam/beam-sdks-java-io-tika

/**
 * Parses one readable file with Tika and outputs a {@code ParseResult} for it.
 *
 * <p>The parser is an {@code AutoDetectParser}, built from {@code tikaConfig} when one is
 * set, and it is also registered in the {@code ParseContext} under {@code Parser.class}.
 * A parse failure is not rethrown: it is captured in a failure result that carries the
 * text and metadata gathered up to that point.
 *
 * @param c process context supplying the file and receiving the result
 * @throws Exception if opening the file or closing the streams fails
 */
@ProcessElement
 public void processElement(ProcessContext c) throws Exception {
  ReadableFile file = c.element();
  // Both streams are resources so the raw channel stream is closed even when wrapping it
  // in a TikaInputStream fails; previously it leaked in that case.
  try (InputStream stream = Channels.newInputStream(file.open());
    InputStream tikaStream = TikaInputStream.get(stream)) {
   Parser parser =
     tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);
   ParseContext context = new ParseContext();
   context.set(Parser.class, parser);
   Metadata tikaMetadata =
     spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
   if (spec.getContentTypeHint() != null) {
    tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
   }
   String location = file.getMetadata().resourceId().toString();
   ParseResult res;
   ContentHandler tikaHandler = new ToTextContentHandler();
   try {
    parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
    res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
   } catch (Exception e) {
    // Keep whatever text and metadata were collected before the failure.
    res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
   }
   c.output(res);
  }
 }
}

代码示例来源:origin: apache/tika

MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
ParseContext context = new ParseContext();
BodyContentHandler handler;
Metadata metadata;
Set<MediaType> types = p.getSupportedTypes(context);
assertEquals(2, types.size());
assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN));
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("", handler.toString());

代码示例来源:origin: apache/tika

/**
 * Parses one page held in memory with the supplied HTML parser, forwarding its body content
 * to {@code xhtml} through an embedded-content handler.
 *
 * <p>A {@code SAXException} is rethrown wrapped in a {@code RuntimeException}; an
 * {@code IOException} is deliberately ignored.
 *
 * @param byteObject raw bytes of the page
 * @param htmlParser parser applied to the bytes
 * @param xhtml      handler that receives the page body
 * @param context    parse context passed through to the parser
 * @throws TikaException if the parser reports a Tika-level failure
 */
private void parsePage(byte[] byteObject, Parser htmlParser,
            ContentHandler xhtml, ParseContext context) throws TikaException {
  Metadata pageMetadata = new Metadata();
  ContentHandler embedded = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
  try {
    InputStream pageBytes = new ByteArrayInputStream(byteObject);
    htmlParser.parse(pageBytes, embedded, pageMetadata, context);
  } catch (SAXException e) {
    throw new RuntimeException(e);
  } catch (IOException ignored) {
    // Pushback overflow from tagsoup
  }
}

代码示例来源:origin: apache/tika

@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
    throws IOException, SAXException, TikaException {
  TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig);
    _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext);

代码示例来源:origin: NGDATA/lilyproject

BodyContentHandler ch = new BodyContentHandler(woh);
  Metadata metadata = new Metadata();
  metadata.add(Metadata.CONTENT_TYPE, blob.getMediaType());
  if (blob.getName() != null) {
    metadata.add(Metadata.RESOURCE_NAME_KEY, blob.getName());
  ParseContext parseContext = new ParseContext();
  tikaParser.parse(is, ch, metadata, parseContext);
} catch (Throwable t) {
  if (woh.isWriteLimitReached(t)) {
String text = ch.toString();
if (text.length() > 0) {
  result.add(text);

代码示例来源:origin: apache/tika

String v = toString(obj, c.getType());
if (isRichText(c)) {
  BodyContentHandler h = new BodyContentHandler();
  Metadata m = new Metadata();
  m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
  try {
    htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
        h,
        m, parseContext);
    handler.characters(h.toString());
  } catch (SAXException e) {

代码示例来源:origin: apache/tika

/**
 * Loads the Spring context from {@code org/apache/tika/example/spring.xml}, looks up the
 * {@code Parser} bean named {@code "tika"} and uses it to parse a short in-memory text,
 * writing the parsed output to standard out.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the context cannot be loaded or parsing fails
 */
public static void main(String[] args) throws Exception {
    String[] configLocations = {"org/apache/tika/example/spring.xml"};
    ApplicationContext context = new ClassPathXmlApplicationContext(configLocations);
    Parser parser = context.getBean("tika", Parser.class);

    ByteArrayInputStream input = new ByteArrayInputStream("Hello, World!".getBytes(UTF_8));
    WriteOutContentHandler stdoutHandler = new WriteOutContentHandler(System.out);
    parser.parse(input, stdoutHandler, new Metadata(), new ParseContext());
}
}

代码示例来源:origin: apache/tika

metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
System.out.println("The MIME type (based on filename) is: ["
    + mimeRegistry.detect(null, metadata) + "]");
metadata.set(Metadata.CONTENT_TYPE, type.toString());
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, new ParseContext());

代码示例来源:origin: ViDA-NYU/ache

/**
 * Extracts text and metadata from a stream with Tika.
 *
 * <p>The text is collected by a {@code BodyContentHandler} limited to
 * {@code MAX_CHARACTERS}, fed through a {@code BoilerpipeContentHandler} configured with
 * {@code KeepEverythingExtractor}. Metadata is returned as a map with one value per name.
 *
 * @param stream      content to parse
 * @param fileName    file name passed to {@code createMetadata}
 * @param contentType content type passed to {@code createMetadata}
 * @return the parsed text and metadata, or {@code null} if parsing fails (the failure is
 *     logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
  BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
  BoilerpipeContentHandler boilerpipeHandler =
      new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
  Metadata tikaMetadata = createMetadata(fileName, contentType);
  ParseContext parseContext = new ParseContext();
  try {
    parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);

    Map<String, String> properties = new HashMap<>();
    for (String key : tikaMetadata.names()) {
      properties.put(key, tikaMetadata.get(key));
    }

    String extractedText = bodyHandler.toString();
    return new ParsedData(extractedText, properties);
  } catch (IOException | SAXException | TikaException e) {
    logger.error("Failed to extract metadata using Tika.", e);
    return null;
  }
}

代码示例来源:origin: com.github.lafa.tikaNoExternal/tika-parsers

/**
 * Dispatches a single zip entry by name.
 *
 * <ul>
 *   <li>{@code mimetype}: its text becomes the {@code Content-Type} metadata value;</li>
 *   <li>{@code META_NAME}: parsed by the {@code meta} parser with a no-op handler;</li>
 *   <li>names ending in {@code content.xml} or {@code styles.xml}: parsed by the
 *       {@code content} parser, through {@code parseInternal} when it is an
 *       {@code OpenDocumentContentParser};</li>
 *   <li>anything else, or a {@code null} entry: ignored.</li>
 * </ul>
 *
 * @param entry    zip entry to handle; may be {@code null}
 * @param zip      stream positioned at the entry's data
 * @param metadata metadata to update
 * @param context  parse context passed through to the delegate parsers
 * @param handler  handler receiving content events
 */
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
                ParseContext context, EndDocumentShieldingContentHandler handler)
      throws IOException, SAXException, TikaException {
    if (entry == null) {
      return;
    }

    String entryName = entry.getName();
    if (entryName.equals("mimetype")) {
      metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(zip, UTF_8));
    } else if (entryName.equals(META_NAME)) {
      meta.parse(zip, new DefaultHandler(), metadata, context);
    } else if (entryName.endsWith("content.xml") || entryName.endsWith("styles.xml")) {
      // Content and styles entries are handled identically.
      if (content instanceof OpenDocumentContentParser) {
        ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
      } else {
        // Foreign content parser was set:
        content.parse(zip, handler, metadata, context);
      }
    }
}
}

代码示例来源:origin: apache/tika

context.set(Parser.class, decorator);
ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler();
long started = System.currentTimeMillis();
parserState.recursiveParserWrapperHandler.startDocument();
try {
  getWrappedParser().parse(stream, localHandler, metadata, context);
} catch (SAXException e) {
  boolean wlr = isWriteLimitReached(e);
    throw e;
  metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true");
} catch (Throwable e) {
  metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime", stackTrace);
  throw e;
} finally {
  long elapsedMillis = System.currentTimeMillis() - started;
  metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
  parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
  parserState.recursiveParserWrapperHandler.endDocument();

相关文章