From 5fc86e5ec11b09841a7bd42eff5220196266ba76 Mon Sep 17 00:00:00 2001 From: David du Colombier Date: Wed, 18 Feb 2026 14:57:58 +0100 Subject: [PATCH 1/4] Add streaming xmlTextReader to oscap_source Add oscap_source_get_streaming_xmlTextReader() that creates an xmlTextReader directly from file contents or memory buffer without loading the full XML DOM first. For file-based sources, the file is read into a memory buffer and parsed with xmlReaderForMemory. If the file cannot be read, the buffer is discarded and the existing DOM-based path is used instead. For memory-based sources, the buffer is parsed directly. BZ2-compressed sources fall back to the existing DOM-based path. Also switch oscap_source_get_scap_type() and oscap_source_get_schema_version() to use the streaming reader, avoiding unnecessary DOM construction for document type detection and schema version extraction. --- src/source/oscap_source.c | 97 ++++++++++++++++++++++++++++++++-- src/source/oscap_source_priv.h | 28 +++++++++++- 2 files changed, 120 insertions(+), 5 deletions(-) diff --git a/src/source/oscap_source.c b/src/source/oscap_source.c index 2acbe88b27..f578991cd4 100644 --- a/src/source/oscap_source.c +++ b/src/source/oscap_source.c @@ -25,6 +25,7 @@ #endif #include +#include #include #ifdef OS_WINDOWS #include @@ -163,6 +164,15 @@ void oscap_source_free_xmlDoc(struct oscap_source *source) } } +void oscap_source_free_memory(struct oscap_source *source) +{ + if (source != NULL) { + free(source->origin.memory); + source->origin.memory = NULL; + source->origin.memory_size = 0; + } +} + /** * Returns human readable description of oscap_source origin */ @@ -187,17 +197,96 @@ xmlTextReader *oscap_source_get_xmlTextReader(struct oscap_source *source) return reader; } +/** + * Creates a reader over the raw XML bytes of the source, without a DOM. + * Falls back to oscap_source_get_xmlTextReader() when a DOM is already + * cached, the content is bzip2-compressed, or the file cannot be buffered. + */ +xmlTextReader *oscap_source_get_streaming_xmlTextReader(struct oscap_source *source) +{ + if (source->xml.doc != NULL) { + return oscap_source_get_xmlTextReader(source); + } + + if (source->origin.memory != NULL) { + if (bz2_memory_is_bzip(source->origin.memory, source->origin.memory_size)) { + return 
oscap_source_get_xmlTextReader(source); + } + xmlTextReader *reader = xmlReaderForMemory(source->origin.memory, + source->origin.memory_size, NULL, NULL, 0); + if (reader == NULL) { + oscap_seterr(OSCAP_EFAMILY_XML, "Unable to create streaming xmlTextReader for %s", + oscap_source_readable_origin(source)); + oscap_setxmlerr(xmlGetLastError()); + } + return reader; + } + + if (source->origin.filepath != NULL) { + int fd = open(source->origin.filepath, O_RDONLY); + if (fd == -1) { + oscap_seterr(OSCAP_EFAMILY_XML, "Unable to open file for streaming xmlTextReader: %s", + oscap_source_readable_origin(source)); + return NULL; + } + if (bz2_fd_is_bzip(fd)) { + close(fd); + return oscap_source_get_xmlTextReader(source); + } + struct stat st; + if (fstat(fd, &st) != 0 || st.st_size <= 0) { + close(fd); + return oscap_source_get_xmlTextReader(source); + } + size_t file_size = (size_t)st.st_size; + /* Fill a local buffer first so that a failed read never leaves a + * truncated document cached on the source. + * NOTE(review): assumes bz2_fd_is_bzip() rewound fd to offset 0 - confirm. */ + char *buf = malloc(file_size); + if (buf == NULL) { + close(fd); + return oscap_source_get_xmlTextReader(source); + } + size_t total_read = 0; + ssize_t n = 0; + while (total_read < file_size) { + n = read(fd, buf + total_read, file_size - total_read); + if (n <= 0) { + break; + } + total_read += (size_t)n; + } + close(fd); + if (n < 0 || total_read == 0) { + free(buf); + return oscap_source_get_xmlTextReader(source); + } + source->origin.memory = buf; + source->origin.memory_size = total_read; + xmlTextReader *reader = xmlReaderForMemory(source->origin.memory, + source->origin.memory_size, source->origin.filepath, NULL, 0); + if (reader == NULL) { + oscap_seterr(OSCAP_EFAMILY_XML, "Unable to create streaming xmlTextReader for %s", + oscap_source_readable_origin(source)); + oscap_setxmlerr(xmlGetLastError()); + } + return reader; + } + + oscap_seterr(OSCAP_EFAMILY_XML, "Unable to create streaming xmlTextReader for %s", + oscap_source_readable_origin(source)); + return NULL; +} + oscap_document_type_t oscap_source_get_scap_type(struct oscap_source *source) { if (source->scap_type == OSCAP_DOCUMENT_UNKNOWN) { - xmlTextReader *reader = oscap_source_get_xmlTextReader(source); + 
xmlTextReader *reader = oscap_source_get_streaming_xmlTextReader(source); if (reader == NULL) { - // the oscap error is already set return OSCAP_DOCUMENT_UNKNOWN; } if (oscap_determine_document_type_reader(reader, &(source->scap_type)) == -1) { oscap_seterr(OSCAP_EFAMILY_XML, "Unknown document type: '%s'", oscap_source_readable_origin(source)); - // in case of error scap_type must remain UNKNOWN assert(source->scap_type == OSCAP_DOCUMENT_UNKNOWN); } xmlFreeTextReader(reader); @@ -385,7 +474,7 @@ int oscap_source_validate_schematron(struct oscap_source *source) const char *oscap_source_get_schema_version(struct oscap_source *source) { if (source->origin.version == NULL) { - xmlTextReader *reader = oscap_source_get_xmlTextReader(source); + xmlTextReader *reader = oscap_source_get_streaming_xmlTextReader(source); if (reader == NULL) { return NULL; } diff --git a/src/source/oscap_source_priv.h b/src/source/oscap_source_priv.h index 4c4aa3d424..7492b96843 100644 --- a/src/source/oscap_source_priv.h +++ b/src/source/oscap_source_priv.h @@ -60,13 +60,30 @@ struct oscap_source *oscap_source_new_from_xmlDoc(xmlDoc *doc, const char *filep /** * Get an xmlTextReader assigned with this resource. The reader needs to be - * disposed by caller. + * disposed by caller. This variant walks over the in-memory DOM (loading + * it first if necessary). * @memberof oscap_source * @param source Resource to read the content * @returns xmlTextReader structure to read the content */ xmlTextReader *oscap_source_get_xmlTextReader(struct oscap_source *source); +/** + * Get a streaming xmlTextReader that does NOT require loading the full DOM + * into memory. File-based sources are read into a memory buffer that is kept + * on the source; memory-based sources are parsed from their buffer. Sources + * that already have a cached DOM, or are bzip2-compressed, fall back to get_xmlTextReader. 
+ * + * This should be preferred over oscap_source_get_xmlTextReader() when the + * caller only needs sequential read access and does not need the DOM to + * persist after reading. + * + * @memberof oscap_source + * @param source Resource to read the content + * @returns xmlTextReader structure to read the content + */ +xmlTextReader *oscap_source_get_streaming_xmlTextReader(struct oscap_source *source); + /** * Get a DOM representation of this resource. The document ins still owned * by oscap_source. @@ -85,4 +102,13 @@ xmlDoc *oscap_source_get_xmlDoc(struct oscap_source *source); */ xmlDoc *oscap_source_pop_xmlDoc(struct oscap_source *source); +/** + * Release the memory buffer held by this source. After this call, the + * source can no longer be parsed from its raw memory. This is useful + * to reduce memory after the source has been fully consumed. + * @memberof oscap_source + * @param source Resource to release memory buffer from + */ +void oscap_source_free_memory(struct oscap_source *source); + #endif From 992a7a4d29aa9f4df70e7c979b05c45d36b88989 Mon Sep 17 00:00:00 2001 From: David du Colombier Date: Wed, 18 Feb 2026 14:57:59 +0100 Subject: [PATCH 2/4] Use streaming xmlTextReader in all OVAL importers Switch oval_definition_model, oval_syschar_model, oval_variable_model, oval_directives_model, and oval_results_model import functions to use oscap_source_get_streaming_xmlTextReader() instead of oscap_source_get_xmlTextReader(). This avoids loading the full XML DOM into memory when importing OVAL documents, since the OVAL parsers only use streaming-compatible xmlTextReader API calls. 
--- src/OVAL/oval_defModel.c | 2 +- src/OVAL/oval_directives.c | 2 +- src/OVAL/oval_sysModel.c | 2 +- src/OVAL/oval_varModel.c | 2 +- src/OVAL/results/oval_resModel.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/OVAL/oval_defModel.c b/src/OVAL/oval_defModel.c index 00b9043524..7320faf41d 100644 --- a/src/OVAL/oval_defModel.c +++ b/src/OVAL/oval_defModel.c @@ -226,7 +226,7 @@ static inline int _oval_definition_model_merge_source(struct oval_definition_mod { /* setup context */ struct oval_parser_context context; - context.reader = oscap_source_get_xmlTextReader(source); + context.reader = oscap_source_get_streaming_xmlTextReader(source); if (context.reader == NULL) { return -1; } diff --git a/src/OVAL/oval_directives.c b/src/OVAL/oval_directives.c index 55f110be9d..ffb528c38d 100644 --- a/src/OVAL/oval_directives.c +++ b/src/OVAL/oval_directives.c @@ -111,7 +111,7 @@ int oval_directives_model_import_source(struct oval_directives_model *model, str /* setup context */ struct oval_parser_context context; - context.reader = oscap_source_get_xmlTextReader(source); + context.reader = oscap_source_get_streaming_xmlTextReader(source); if (context.reader == NULL) { return -1; } diff --git a/src/OVAL/oval_sysModel.c b/src/OVAL/oval_sysModel.c index e5a1bb525d..95f858893b 100644 --- a/src/OVAL/oval_sysModel.c +++ b/src/OVAL/oval_sysModel.c @@ -235,7 +235,7 @@ int oval_syschar_model_import_source(struct oval_syschar_model *model, struct os int ret = 0; /* setup context */ struct oval_parser_context context; - context.reader = oscap_source_get_xmlTextReader(source); + context.reader = oscap_source_get_streaming_xmlTextReader(source); if (context.reader == NULL) { return -1; } diff --git a/src/OVAL/oval_varModel.c b/src/OVAL/oval_varModel.c index e9c025bedd..29cef3ac12 100644 --- a/src/OVAL/oval_varModel.c +++ b/src/OVAL/oval_varModel.c @@ -302,7 +302,7 @@ static int _oval_variable_model_parse(struct oval_variable_model *model, xmlText struct 
oval_variable_model *oval_variable_model_import_source(struct oscap_source *source) { int ret; - xmlTextReader *reader = oscap_source_get_xmlTextReader(source); + xmlTextReader *reader = oscap_source_get_streaming_xmlTextReader(source); if (reader == NULL) { return NULL; } diff --git a/src/OVAL/results/oval_resModel.c b/src/OVAL/results/oval_resModel.c index e8bcf22841..d2339a06f8 100644 --- a/src/OVAL/results/oval_resModel.c +++ b/src/OVAL/results/oval_resModel.c @@ -200,7 +200,7 @@ int oval_results_model_import_source(struct oval_results_model *model, struct os /* setup context */ struct oval_parser_context context; - context.reader = oscap_source_get_xmlTextReader(source); + context.reader = oscap_source_get_streaming_xmlTextReader(source); if (context.reader == NULL) { return -1; } From ad842e97810794dc1f84ef9c86ea0ed33a583e39 Mon Sep 17 00:00:00 2001 From: David du Colombier Date: Wed, 18 Feb 2026 14:58:00 +0100 Subject: [PATCH 3/4] Serialize extracted SDS components to memory buffers Instead of keeping cloned DOM trees for extracted DataStream components, serialize them to compact XML text buffers via xmlDocDumpMemory() and immediately free the cloned DOM. The component oscap_source is then created from the memory buffer using oscap_source_new_take_memory(). This reduces peak memory during SDS decomposition because serialized XML text is typically 3-5x smaller than its libxml2 DOM representation. The streaming xmlTextReader can also parse directly from these buffers without constructing an intermediate DOM. 
--- src/DS/sds.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/DS/sds.c b/src/DS/sds.c index 15f5079c39..10083b0d54 100644 --- a/src/DS/sds.c +++ b/src/DS/sds.c @@ -244,12 +244,30 @@ static int ds_sds_register_xmlDoc(struct ds_sds_session *session, xmlDoc* doc, x return -1; } - struct oscap_source *component_source = oscap_source_new_from_xmlDoc(new_doc, relative_filepath); + xmlChar *xml_buf = NULL; + int buf_size = 0; + xmlDocDumpMemory(new_doc, &xml_buf, &buf_size); + xmlFreeDoc(new_doc); + if (xml_buf == NULL || buf_size <= 0) { + oscap_seterr(OSCAP_EFAMILY_XML, "Failed to serialize extracted component '%s'", relative_filepath); + xmlFree(xml_buf); + return -1; + } + + char *buf = malloc((size_t)buf_size); + if (buf == NULL) { + xmlFree(xml_buf); + return -1; + } + memcpy(buf, xml_buf, (size_t)buf_size); + xmlFree(xml_buf); + + struct oscap_source *component_source = oscap_source_new_take_memory(buf, (size_t)buf_size, relative_filepath); if (ds_sds_session_register_component_source(session, relative_filepath, component_source) != 0) { oscap_source_free(component_source); } - return 0; // TODO: Return value of ds_sds_session_register_component_source(). (commit message) + return 0; } static int ds_sds_register_component(struct ds_sds_session *session, xmlDoc* doc, xmlNodePtr component_inner_root, const char* component_id, const char* target_filename_dirname, const char* relative_filepath) From 27ba70ca2b22ed646894447d7b9e4ce0d4da2d10 Mon Sep 17 00:00:00 2001 From: David du Colombier Date: Wed, 18 Feb 2026 14:58:01 +0100 Subject: [PATCH 4/4] Free source XML DOMs immediately after model import Release the xmlDoc held by OVAL and XCCDF sources as soon as the corresponding object models have been built from them. In xccdf_session_load_oval(), call oscap_source_free_xmlDoc() on each OVAL source right after oval_definition_model_import_source(). 
In _xccdf_session_load_xccdf_benchmark(), free the XCCDF source DOM right after xccdf_benchmark_import_source(). This eliminates the window where both the XML DOM and the parsed object model coexist in memory during the loading phase. --- src/XCCDF/xccdf_session.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/XCCDF/xccdf_session.c b/src/XCCDF/xccdf_session.c index 5466402cae..da2e0e5bbf 100644 --- a/src/XCCDF/xccdf_session.c +++ b/src/XCCDF/xccdf_session.c @@ -33,6 +33,7 @@ #include #include "oscap_source.h" +#include "source/oscap_source_priv.h" #include #include #include @@ -792,6 +793,7 @@ static inline int _xccdf_session_load_xccdf_benchmark(struct xccdf_session *sess if (benchmark == NULL) { return 1; } + oscap_source_free_xmlDoc(session->xccdf.source); /* create the policy model */ session->xccdf.policy_model = xccdf_policy_model_new(benchmark); @@ -1216,6 +1218,7 @@ int xccdf_session_load_oval(struct xccdf_session *session) oscap_source_readable_origin(contents[idx]->source)); return 1; } + oscap_source_free_xmlDoc(contents[idx]->source); /* def_model -> session */ struct oval_agent_session *tmp_sess = oval_agent_new_session(tmp_def_model, contents[idx]->href);