From 88fdf5b7c4da1ed9acb4d5711ef771566edcc39b Mon Sep 17 00:00:00 2001 From: Koval Dmitry Date: Sun, 14 Jan 2024 21:02:55 +0300 Subject: [PATCH v2] Added support of XML_PARSE_HUGE flag for XML documents 1. PostgreSQL calls libxml2 library functions without the XML_PARSE_HUGE flag, but in practice the 10 MB limit is too small. Calling the libxml2 functions with the XML_PARSE_HUGE flag increases the maximum size allowed for a single text node from 10,000,000 to 1,000,000,000 (see the XML_MAX_TEXT_LENGTH macro), which in most cases solves the problem with insufficient memory. 2. The xmlParseBalancedChunkMemory function (which has no argument for passing the XML_PARSE_HUGE flag) was replaced with the xmlNewNode + xmlParseInNodeContext functions (a fake node is created and the XML_PARSE_HUGE flag is passed to the xmlParseInNodeContext function). 3. Replaced the xmlParseMemory and xmlSubstituteEntitiesDefault functions, which are marked as deprecated [1]. [1] https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-parser.html --- contrib/xml2/xpath.c | 10 ++++++---- contrib/xml2/xslt_proc.c | 10 ++++++---- src/backend/utils/adt/xml.c | 35 +++++++++++++++++++++++++++-------- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/contrib/xml2/xpath.c b/contrib/xml2/xpath.c index a692dc6be8..a2cec95f3f 100644 --- a/contrib/xml2/xpath.c +++ b/contrib/xml2/xpath.c @@ -74,7 +74,6 @@ pgxml_parser_init(PgXmlStrictness strictness) /* Initialize libxml */ xmlInitParser(); - xmlSubstituteEntitiesDefault(1); xmlLoadExtDtdDefaultValue = 1; return xmlerrcxt; @@ -380,8 +379,9 @@ pgxml_xpath(text *document, xmlChar *xpath, xpath_workspace *workspace) PG_TRY(); { - workspace->doctree = xmlParseMemory((char *) VARDATA_ANY(document), - docsize); + workspace->doctree = xmlReadMemory((char *) VARDATA_ANY(document), + docsize, NULL, NULL, + XML_PARSE_HUGE | XML_PARSE_NOENT); if (workspace->doctree != NULL) { workspace->ctxt = xmlXPathNewContext(workspace->doctree); @@ -624,7 +624,9 @@ xpath_table(PG_FUNCTION_ARGS) /* 
Parse the document */ if (xmldoc) - doctree = xmlParseMemory(xmldoc, strlen(xmldoc)); + doctree = xmlReadMemory(xmldoc, strlen(xmldoc), + NULL, NULL, + XML_PARSE_HUGE | XML_PARSE_NOENT); else /* treat NULL as not well-formed */ doctree = NULL; diff --git a/contrib/xml2/xslt_proc.c b/contrib/xml2/xslt_proc.c index 2189bca86f..9cbc05db1a 100644 --- a/contrib/xml2/xslt_proc.c +++ b/contrib/xml2/xslt_proc.c @@ -85,16 +85,18 @@ xslt_process(PG_FUNCTION_ARGS) bool xslt_sec_prefs_error; /* Parse document */ - doctree = xmlParseMemory((char *) VARDATA_ANY(doct), - VARSIZE_ANY_EXHDR(doct)); + doctree = xmlReadMemory((char *) VARDATA_ANY(doct), + VARSIZE_ANY_EXHDR(doct), NULL, NULL, + XML_PARSE_HUGE | XML_PARSE_NOENT); if (doctree == NULL) xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION, "error parsing XML document"); /* Same for stylesheet */ - ssdoc = xmlParseMemory((char *) VARDATA_ANY(ssheet), - VARSIZE_ANY_EXHDR(ssheet)); + ssdoc = xmlReadMemory((char *) VARDATA_ANY(ssheet), + VARSIZE_ANY_EXHDR(ssheet), NULL, NULL, + XML_PARSE_HUGE | XML_PARSE_NOENT); if (ssdoc == NULL) xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION, diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 191dd2d1e2..9402e645f7 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1688,7 +1688,7 @@ xml_doctype_in_content(const xmlChar *str) * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode). * * If parsed_nodes isn't NULL and the input is not an XML document, the list - * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned + * of parsed nodes from the xmlParseInNodeContext call will be returned * to *parsed_nodes. 
* * Errors normally result in ereport(ERROR), but if escontext is an @@ -1795,7 +1795,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, doc = xmlCtxtReadDoc(ctxt, utf8string, NULL, "UTF-8", - XML_PARSE_NOENT | XML_PARSE_DTDATTR + XML_PARSE_NOENT | XML_PARSE_DTDATTR | XML_PARSE_HUGE | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS)); if (doc == NULL || xmlerrcxt->err_occurred) { @@ -1828,10 +1828,29 @@ xml_parse(text *data, XmlOptionType xmloption_arg, /* allow empty content */ if (*(utf8string + count)) { - res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, - utf8string + count, - parsed_nodes); - if (res_code != 0 || xmlerrcxt->err_occurred) + const char *data; + xmlNodePtr root; + xmlParserErrors xml_error; + xmlNodePtr lst; + + data = (const char *) (utf8string + count); + + /* + * Create a fake root node. The xmlNewDoc function creates an + * XML document without any nodes. But we need one to call the + * xmlParseInNodeContext function. + */ + root = xmlNewNode(NULL, (const xmlChar *) "content-root"); + if (root == NULL || xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xml node"); + xmlDocSetRootElement(doc, root); + + /* Try to parse string with using root node context. */ + xml_error = xmlParseInNodeContext(root, data, strlen(data), + XML_PARSE_HUGE, + parsed_nodes ? 
parsed_nodes : &lst); + if (xml_error != XML_ERR_OK || xmlerrcxt->err_occurred) { xml_errsave(escontext, xmlerrcxt, ERRCODE_INVALID_XML_CONTENT, @@ -4344,7 +4363,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len, - len - xmldecl_len, NULL, NULL, 0); + len - xmldecl_len, NULL, NULL, XML_PARSE_HUGE); if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML document"); @@ -4675,7 +4694,7 @@ XmlTableSetDocument(TableFuncScanState *state, Datum value) PG_TRY(); { - doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, 0); + doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, XML_PARSE_HUGE); if (doc == NULL || xtCxt->xmlerrcxt->err_occurred) xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML document"); -- 2.40.1.windows.1