我在Java 17中序列化一个XML文档,并把它存储到一个文件中,假设以后能够把它反序列化。不幸的是,当我尝试读取它时,得到了一个SAXParseException:其中一个字符不是合法的XML字符。我原本期望由Java序列化的文档也能由Java反序列化。
我只剩下两个问题
1.如何读取先前生成的包含无效字符的文件?
2.如何防止新生成的文件包含无效字符?
最小可重现示例:
package example;
import static java.nio.charset.StandardCharsets.US_ASCII;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class Scratch {
public static void main(String[] args) throws Exception {
// u001b = escape and is not allowed in xml 1.0
Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
Element element = document.createElement("tag");
element.setAttribute("attribute", "\u001b");
document.appendChild(element);
ByteArrayOutputStream bos = new ByteArrayOutputStream();
TransformerFactory transformerFactory = TransformerFactory.newInstance();
System.out.println("transformerFactory.getClass() = " + transformerFactory.getClass());
// transformerFactory.getClass() = class org.apache.xalan.processor.TransformerFactoryImpl
Transformer transformer = transformerFactory.newTransformer();
transformer.setOutputProperty(OutputKeys.ENCODING, US_ASCII.name());
transformer.transform(new DOMSource(document), new StreamResult(bos));
String xmlString = bos.toString(US_ASCII);
System.out.println("xmlString = " + xmlString);
// xmlString = <?xml version="1.0" encoding="US-ASCII"?><tag attribute=""/>
DocumentBuilder domDocumentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
domDocumentBuilder.parse(new ByteArrayInputStream(xmlString.getBytes(US_ASCII)));
// [Fatal Error] :1:63: Character reference "" is an invalid XML character.
// Exception in thread "main" org.xml.sax.SAXParseException; lineNumber: 1; columnNumber: 63; Character reference "" is an invalid XML character.
// at java.xml/com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:262)
// at java.xml/com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:342)
// at java.xml/javax.xml.parsers.DocumentBuilder.parse(DocumentBuilder.java:122)
// at be.mips.cyberlab.business.Scratch.main(Scratch.java:40)
}
}
Michael Kay建议“使用一个检查内容的API,或者在传递给API之前自己检查内容”,所以我尝试了一下。这样做可以工作,但我不喜欢仅仅为了序列化而遍历整棵树。
package example;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.w3c.dom.Node.CDATA_SECTION_NODE;
import static org.w3c.dom.Node.COMMENT_NODE;
import static org.w3c.dom.Node.ELEMENT_NODE;
import static org.w3c.dom.Node.TEXT_NODE;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
class Scratch {

    /** U+FFFD REPLACEMENT CHARACTER, substituted for every character XML 1.0 does not allow. */
    private static final String REPLACEMENT_CHARACTER = "\uFFFD";

    /**
     * Demonstrates the workaround: sanitize the DOM before serializing it so that the
     * serialized document can be parsed again.
     *
     * @param args unused
     * @throws Exception if building, serializing or re-parsing the document fails
     */
    public static void main(String[] args) throws Exception {
        // U+001B (escape) is not allowed in XML 1.0.
        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        Element element = document.createElement("tag");
        element.setAttribute("attribute", "\u001b");
        document.appendChild(element);

        // Replace illegal characters in place before the serializer ever sees them.
        stripInvalidXmlCharacters(document);

        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        System.out.println("transformerFactory.getClass() = " + transformerFactory.getClass());
        // Recorded output:
        // transformerFactory.getClass() = class org.apache.xalan.processor.TransformerFactoryImpl
        final Transformer transformer = transformerFactory.newTransformer();
        transformer.setOutputProperty(OutputKeys.ENCODING, US_ASCII.name());
        transformer.transform(new DOMSource(document), new StreamResult(bos));
        String xmlString = bos.toString(US_ASCII);
        System.out.println("xmlString = " + xmlString);
        // Expected output (U+FFFD is not encodable in US-ASCII, so it is written as a reference):
        // xmlString = <?xml version="1.0" encoding="US-ASCII"?><tag attribute="&#65533;"/>
        DocumentBuilder domDocumentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        domDocumentBuilder.parse(new ByteArrayInputStream(xmlString.getBytes(US_ASCII)));
    }

    /**
     * Recursively replaces characters that are illegal in XML 1.0 with U+FFFD in the attribute
     * values of {@code node} and in the text, CDATA and comment nodes beneath it.
     *
     * <p>Other node types (for example processing instructions) and element or attribute
     * names are left untouched.
     *
     * @param node root of the subtree to sanitize; modified in place
     */
    private static void stripInvalidXmlCharacters(Node node) {
        // getAttributes() is null for every node type except elements.
        NamedNodeMap attributes = node.getAttributes();
        if (attributes != null) {
            for (int i = 0, length = attributes.getLength(); i < length; i++) {
                sanitizeNodeValue(attributes.item(i));
            }
        }
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            switch (child.getNodeType()) {
                case ELEMENT_NODE -> stripInvalidXmlCharacters(child);
                case TEXT_NODE, CDATA_SECTION_NODE, COMMENT_NODE -> sanitizeNodeValue(child);
                default -> {
                    // Remaining node types carry no character data handled here.
                }
            }
        }
    }

    /** Rewrites the value of {@code node} only when sanitizing actually changed it. */
    private static void sanitizeNodeValue(Node node) {
        String original = node.getNodeValue();
        String sanitized = stripInvalidXmlCharacters(original);
        if (sanitized != null && !sanitized.equals(original)) {
            node.setNodeValue(sanitized);
        }
    }

    /**
     * Returns {@code input} with every code point outside the XML 1.0 {@code Char} production
     * replaced by U+FFFD.
     *
     * <p>Unpaired surrogates are replaced; well-formed surrogate pairs (supplementary
     * characters) are kept. Characters that XML 1.0 permits, including U+007F..U+009F and
     * U+FEFF, are always preserved, so the result for a character never depends on what
     * else the string contains.
     *
     * @param input text to sanitize, may be {@code null}
     * @return {@code input} itself when nothing had to be replaced (including {@code null}
     *     and the empty string), otherwise a sanitized copy
     */
    private static String stripInvalidXmlCharacters(String input) {
        if (input == null || input.isEmpty()) {
            return input;
        }
        // Allocated lazily so that the common all-valid case returns the input untouched.
        StringBuilder sanitized = null;
        int index = 0;
        while (index < input.length()) {
            // codePointAt yields a lone surrogate as its own value, which is then rejected below.
            int codePoint = input.codePointAt(index);
            if (!isValidXmlCharacter(codePoint)) {
                if (sanitized == null) {
                    sanitized = new StringBuilder(input.length());
                    sanitized.append(input, 0, index);
                }
                sanitized.append(REPLACEMENT_CHARACTER);
            } else if (sanitized != null) {
                sanitized.appendCodePoint(codePoint);
            }
            index += Character.charCount(codePoint);
        }
        return sanitized == null ? input : sanitized.toString();
    }

    /**
     * Tests a code point against the XML 1.0 {@code Char} production:
     * {@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}.
     */
    private static boolean isValidXmlCharacter(int codePoint) {
        return codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }
}
1条答案
按热度按时间ig9co6j11#
如何读取先前生成的包含无效字符的文件?
您必须使用某种不依赖XML解析的处理方式(例如把文件当作纯文本处理)来修复它们,因为XML工具无法读取这些文件。
如何防止新生成的文件包含无效字符?
您可以使用检查内容的API,或者在传递给API之前自己检查内容。
您正在使用的Xalan序列化器是作为XSLT转换器的后端设计的,它很可能假设XSLT处理器正在向它传递有效的内容。Saxon序列化器在架构上是相同的,它也做了同样的假设。