Java在序列化xml文档时不过滤无效字符

s4n0splo  于 2023-03-21  发布在  Java
关注(0)|答案(1)|浏览(92)

我在 Java 17 中序列化一个 XML 文档,并把它存储在文件中,假设以后能够把它反序列化。不幸的是,当我尝试读取它时,得到了一个 SAXParseException:文档中的某个字符不是 XML 中的有效字符(valid character in XML)。我原本期望由 Java 序列化的文档也能由 Java 反序列化。
我只剩下两个问题
1.如何读取先前生成的包含无效字符的文件?
2.如何防止新生成的文件包含无效字符?
最小可重现示例:

package example;

import static java.nio.charset.StandardCharsets.US_ASCII;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Element;

public class Scratch {
    public static void main(String[] args) throws Exception {
        // u001b = escape and is not allowed in xml 1.0
        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        Element element = document.createElement("tag");
        element.setAttribute("attribute", "\u001b");
        document.appendChild(element);

        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        System.out.println("transformerFactory.getClass() = " + transformerFactory.getClass());
        // transformerFactory.getClass() = class org.apache.xalan.processor.TransformerFactoryImpl
        Transformer transformer = transformerFactory.newTransformer();
        transformer.setOutputProperty(OutputKeys.ENCODING, US_ASCII.name());
        transformer.transform(new DOMSource(document), new StreamResult(bos));
        String xmlString = bos.toString(US_ASCII);

        System.out.println("xmlString = " + xmlString);
        // xmlString = <?xml version="1.0" encoding="US-ASCII"?><tag attribute="&#27;"/>

        DocumentBuilder domDocumentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        domDocumentBuilder.parse(new ByteArrayInputStream(xmlString.getBytes(US_ASCII)));
        // [Fatal Error] :1:63: Character reference "&#27" is an invalid XML character.
        //     Exception in thread "main" org.xml.sax.SAXParseException; lineNumber: 1; columnNumber: 63; Character reference "&#27" is an invalid XML character.
        //     at java.xml/com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:262)
        // at java.xml/com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:342)
        // at java.xml/javax.xml.parsers.DocumentBuilder.parse(DocumentBuilder.java:122)
        // at be.mips.cyberlab.business.Scratch.main(Scratch.java:40)
    }
}

Michael Kay 建议“使用一个检查内容的API,或者在传递给API之前自己检查内容”,所以我打算尝试后一种做法。它可以工作,但我不喜欢仅仅为了序列化就要遍历整棵树。

package example;

import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.w3c.dom.Node.CDATA_SECTION_NODE;
import static org.w3c.dom.Node.COMMENT_NODE;
import static org.w3c.dom.Node.ELEMENT_NODE;
import static org.w3c.dom.Node.TEXT_NODE;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

class Scratch {

    /** Substituted for every code point that XML 1.0 does not allow (U+FFFD). */
    private static final String REPLACEMENT_CHARACTER = "\uFFFD";

    /**
     * Builds a document containing an ESC character, sanitizes it, serializes it as
     * US-ASCII and parses the result back to show that the round trip now succeeds.
     *
     * @param args unused
     * @throws Exception if building, transforming or parsing the document fails
     */
    public static void main(String[] args) throws Exception {
        // U+001B (escape) is not allowed in XML 1.0
        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        Element element = document.createElement("tag");
        element.setAttribute("attribute", "\u001b");
        document.appendChild(element);

        stripInvalidXmlCharacters(document);

        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        System.out.println("transformerFactory.getClass() = " + transformerFactory.getClass());
        // transformerFactory.getClass() = class org.apache.xalan.processor.TransformerFactoryImpl
        final Transformer transformer = transformerFactory.newTransformer();
        transformer.setOutputProperty(OutputKeys.ENCODING, US_ASCII.name());
        transformer.transform(new DOMSource(document), new StreamResult(bos));
        String xmlString = bos.toString(US_ASCII);

        System.out.println("xmlString = " + xmlString);
        // xmlString = <?xml version="1.0" encoding="US-ASCII"?><tag attribute="&#65533;"/>

        DocumentBuilder domDocumentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        domDocumentBuilder.parse(new ByteArrayInputStream(xmlString.getBytes(US_ASCII)));
    }

    /**
     * Recursively sanitizes the subtree below {@code node} in place: the values of the
     * node's own attributes and of every descendant element's attributes, plus the
     * content of text, CDATA, comment and processing-instruction children.
     *
     * @param node root of the subtree to sanitize (typically the document)
     */
    private static void stripInvalidXmlCharacters(Node node) {
        NamedNodeMap attributes = node.getAttributes();
        if (attributes != null) { // only element nodes expose an attribute map
            int length = attributes.getLength();
            for (int i = 0; i < length; i++) {
                Node attribute = attributes.item(i);
                attribute.setNodeValue(stripInvalidXmlCharacters(attribute.getNodeValue()));
            }
        }
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            switch (child.getNodeType()) {
                case ELEMENT_NODE -> stripInvalidXmlCharacters(child);
                // Processing-instruction data is serialized as well, so it needs the same treatment.
                case TEXT_NODE, CDATA_SECTION_NODE, COMMENT_NODE, Node.PROCESSING_INSTRUCTION_NODE ->
                        child.setNodeValue(stripInvalidXmlCharacters(child.getNodeValue()));
                default -> {
                    // Other node types carry no character data that is handled here.
                }
            }
        }
    }

    /**
     * Replaces every code point that is not an XML 1.0 {@code Char} with U+FFFD.
     *
     * <p>The string is scanned by code point, so a well-formed surrogate pair is kept as
     * one supplementary character while an unpaired surrogate is replaced. Characters that
     * XML 1.0 permits (including U+007F-U+009F and U+FEFF) are always preserved, regardless
     * of what else the string contains.
     *
     * @param input text to sanitize; may be {@code null}
     * @return {@code input} itself when nothing had to be replaced (also for {@code null}
     *         and the empty string), otherwise a sanitized copy
     */
    private static String stripInvalidXmlCharacters(String input) {
        if (input == null || input.isEmpty()) {
            return input;
        }
        StringBuilder sanitized = null; // allocated lazily, on the first invalid code point
        int length = input.length();
        int index = 0;
        while (index < length) {
            int codePoint = input.codePointAt(index);
            int width = Character.charCount(codePoint);
            if (isValidXmlCodePoint(codePoint)) {
                if (sanitized != null) {
                    sanitized.append(input, index, index + width);
                }
            } else {
                if (sanitized == null) {
                    sanitized = new StringBuilder(length).append(input, 0, index);
                }
                sanitized.append(REPLACEMENT_CHARACTER);
            }
            index += width;
        }
        return sanitized == null ? input : sanitized.toString();
    }

    /**
     * Tests a code point against the XML 1.0 production
     * {@code Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}.
     */
    private static boolean isValidXmlCodePoint(int codePoint) {
        return codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }
}
ig9co6j1

ig9co6j11#

如何读取先前生成的包含无效字符的文件?
您必须使用某种不依赖 XML 解析的(非 XML 感知的)处理方式来修复它们,因为 XML 工具无法读取这些文件。
如何防止新生成的文件包含无效字符?
您可以使用检查内容的API,或者在传递给API之前自己检查内容。
您正在使用的Xalan序列化器是作为XSLT转换器的后端设计的,它很可能假设XSLT处理器正在向它传递有效的内容。Saxon序列化器在架构上是相同的,它也做了同样的假设。

相关问题