Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<exclusions>
<exclusion>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-lite</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- needed by jackcess -->
<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,6 @@ public class OfficeParserConfig implements Serializable {
private boolean includeSlideMasterContent = true;
private boolean concatenatePhoneticRuns = true;

private boolean useSAXDocxExtractor = false;
private boolean useSAXPptxExtractor = false;

private boolean preferAlternateContentChoice = true;

private boolean writeSelectHeadersInBody = false;
Expand Down Expand Up @@ -142,40 +139,6 @@ public void setIncludeHeadersAndFooters(boolean includeHeadersAndFooters) {
this.includeHeadersAndFooters = includeHeadersAndFooters;
}

/**
 * Reports whether the experimental SAX-based streaming DOCX parser is selected.
 *
 * @return the current value of the {@code useSAXDocxExtractor} flag
 */
public boolean isUseSAXDocxExtractor() {
    return this.useSAXDocxExtractor;
}

/**
* Use the experimental SAX-based streaming DOCX parser?
* If set to <code>false</code>, the classic parser will be used; if <code>true</code>,
* the new experimental parser will be used.
* <p>
* Default: <code>false</code> (classic DOM parser)
*
* @param useSAXDocxExtractor <code>true</code> to use the experimental SAX-based
* streaming DOCX parser, <code>false</code> to use the classic parser
*/
public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
this.useSAXDocxExtractor = useSAXDocxExtractor;
}

/**
 * Reports the current setting of the SAX-based PPTX extractor option.
 *
 * @return the current value of the {@code useSAXPptxExtractor} flag
 */
public boolean isUseSAXPptxExtractor() {
    return this.useSAXPptxExtractor;
}

/**
* Use the experimental SAX-based streaming PPTX parser?
* If set to <code>false</code>, the classic parser will be used; if <code>true</code>,
* the new experimental parser will be used.
* <p>
* Default: <code>false</code> (classic DOM parser)
*
* @param useSAXPptxExtractor <code>true</code> to use the experimental SAX-based
* streaming PPTX parser, <code>false</code> to use the classic parser
*/
public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
this.useSAXPptxExtractor = useSAXPptxExtractor;
}

/**
* In OOXML, {@code mc:AlternateContent} wraps {@code mc:Choice} (newer/richer
* rendering, e.g. DrawingML text boxes) and {@code mc:Fallback} (degraded VML
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@
import java.util.Map;
import java.util.Set;

import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
Expand All @@ -45,7 +43,6 @@
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
Expand Down Expand Up @@ -93,9 +90,14 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
static final String RELATION_ALTERNATE_FORMAT_CHUNK =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk";

private static final String PACK_OBJECT_REL_TYPE =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
private static final String OLE_OBJECT_REL_TYPE =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";

protected static final String[] EMBEDDED_RELATIONSHIPS =
new String[]{RELATION_AUDIO, PackageRelationshipTypes.IMAGE_PART,
POIXMLDocument.PACK_OBJECT_REL_TYPE, PackageRelationshipTypes.CORE_DOCUMENT,
PACK_OBJECT_REL_TYPE, PackageRelationshipTypes.CORE_DOCUMENT,
RELATION_DIAGRAM_DATA};
private static final String TYPE_OLE_OBJECT =
"application/vnd.openxmlformats-officedocument.oleObject";
Expand All @@ -104,30 +106,23 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
private final EmbeddedDocumentExtractor embeddedExtractor;
private final ParseContext context;
protected OfficeParserConfig config;
protected POIXMLTextExtractor extractor;
protected OPCPackage opcPackage;

public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
public AbstractOOXMLExtractor(ParseContext context, OPCPackage opcPackage) {
this.context = context;
this.extractor = extractor;
this.opcPackage = opcPackage;
embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

// This has already been set by OOXMLParser's call to configure()
// We can rely on this being non-null.
this.config = context.get(OfficeParserConfig.class);
}

/**
 * Returns the document obtained from the wrapped text extractor, cast to
 * {@link POIXMLDocument}.
 *
 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
 */
public POIXMLDocument getDocument() {
    final Object document = extractor.getDocument();
    return (POIXMLDocument) document;
}

/**
* @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
*/
public MetadataExtractor getMetadataExtractor() {
return new MetadataExtractor(extractor);
return new SAXBasedMetadataExtractor(opcPackage, context);
}

ParseContext getParseContext() {
Expand Down Expand Up @@ -173,7 +168,6 @@ protected String getJustFileName(String desc) {

private void handleThumbnail(ContentHandler handler, Metadata metadata) throws SAXException {
try {
OPCPackage opcPackage = extractor.getPackage();
for (PackageRelationship rel : opcPackage
.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
PackagePart tPart = opcPackage.getPart(rel);
Expand Down Expand Up @@ -274,7 +268,7 @@ private void handleEmbeddedPart(PackagePart source, PackageRelationship rel,
if (rel.getTargetMode() != TargetMode.INTERNAL) {
// External target - emit as external reference for security analysis
String type = rel.getRelationshipType();
if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
if (OLE_OBJECT_REL_TYPE.equals(type)) {
emitExternalRef(xhtml, "externalOleObject", targetURI.toString());
parentMetadata.set(Office.HAS_EXTERNAL_OLE_OBJECTS, true);
} else if (PackageRelationshipTypes.IMAGE_PART.equals(type)) {
Expand All @@ -293,7 +287,7 @@ private void handleEmbeddedPart(PackagePart source, PackageRelationship rel,
}
EmbeddedPartMetadata embeddedPartMetadata = embeddedPartMetadataMap.get(rel.getId());
String type = rel.getRelationshipType();
if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type) &&
if (OLE_OBJECT_REL_TYPE.equals(type) &&
TYPE_OLE_OBJECT.equals(target.getContentType())) {
handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(), parentMetadata,
embeddedPartMetadata);
Expand All @@ -308,8 +302,8 @@ private void handleEmbeddedPart(PackagePart source, PackageRelationship rel,
}
} else if (RELATION_MEDIA.equals(type) || RELATION_VIDEO.equals(type) ||
RELATION_AUDIO.equals(type) ||
POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type) ||
POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
PACK_OBJECT_REL_TYPE.equals(type) ||
OLE_OBJECT_REL_TYPE.equals(type)) {
handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(),
embeddedPartMetadata,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT);
Expand Down Expand Up @@ -566,7 +560,7 @@ protected Map<String, String> loadLinkedRelationships(PackagePart bodyPart,
Map<String, String> linkedRelationships = new HashMap<>();
try {
PackageRelationshipCollection prc =
bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
bodyPart.getRelationshipsByType(PackageRelationshipTypes.HYPERLINK_PART);
for (int i = 0; i < prc.size(); i++) {
PackageRelationship pr = prc.getRelationship(i);
if (pr == null) {
Expand Down
Loading
Loading