From 0efcc223cc3b8d782dd750f5bbf1e1ff7c49161f Mon Sep 17 00:00:00 2001 From: Vagisha Sharma Date: Tue, 7 Apr 2026 19:20:10 -0700 Subject: [PATCH] - Fixed DOCTYPE-disallowed regression in NCBI taxonomy parsing. NcbiUtils.getScientificNames threw PxException on NCBI eSummary lookup because the response begins with a declaration that XmlBeansUtil.DOCUMENT_BUILDER_FACTORY rejects. Switch to the new DOCUMENT_BUILDER_FACTORY_ALLOWING_DOCTYPE. - Added unit test --- .../panoramapublic/PanoramaPublicModule.java | 2 + .../proteomexchange/NcbiUtils.java | 162 ++++++++++++++---- 2 files changed, 129 insertions(+), 35 deletions(-) diff --git a/panoramapublic/src/org/labkey/panoramapublic/PanoramaPublicModule.java b/panoramapublic/src/org/labkey/panoramapublic/PanoramaPublicModule.java index 248054b0..be3507bd 100644 --- a/panoramapublic/src/org/labkey/panoramapublic/PanoramaPublicModule.java +++ b/panoramapublic/src/org/labkey/panoramapublic/PanoramaPublicModule.java @@ -54,6 +54,7 @@ import org.labkey.panoramapublic.pipeline.PxValidationPipelineProvider; import org.labkey.panoramapublic.proteomexchange.ExperimentModificationGetter; import org.labkey.panoramapublic.proteomexchange.Formula; +import org.labkey.panoramapublic.proteomexchange.NcbiUtils; import org.labkey.panoramapublic.proteomexchange.SkylineVersion; import org.labkey.panoramapublic.proteomexchange.UnimodUtil; import org.labkey.panoramapublic.proteomexchange.validator.SkylineDocValidator; @@ -384,6 +385,7 @@ public Set getSchemaNames() set.add(BlueskyApiClient.TestCase.class); set.add(PrivateDataReminderSettings.TestCase.class); set.add(NcbiPublicationSearchServiceImpl.TestCase.class); + set.add(NcbiUtils.TestCase.class); return set; } diff --git a/panoramapublic/src/org/labkey/panoramapublic/proteomexchange/NcbiUtils.java b/panoramapublic/src/org/labkey/panoramapublic/proteomexchange/NcbiUtils.java index 13be9cf8..2a8efd6b 100644 --- a/panoramapublic/src/org/labkey/panoramapublic/proteomexchange/NcbiUtils.java +++ b/panoramapublic/src/org/labkey/panoramapublic/proteomexchange/NcbiUtils.java @@ -17,6 +17,8 @@ import org.apache.commons.lang3.StringUtils; import org.json.JSONObject; +import org.junit.Assert; +import org.junit.Test; import org.labkey.api.collections.IntHashMap; import org.labkey.api.util.PageFlowUtil; import org.labkey.api.util.XmlBeansUtil; @@ -32,7 +34,9 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.ParserConfigurationException; import java.io.BufferedReader; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; @@ -117,8 +121,6 @@ public static Map getScientificNames(List taxIds) thro { String queryUrl = eutilsUrl + "&id=" + StringUtils.join(taxIds, ","); - Map sciNameMap = new IntHashMap<>(); - HttpURLConnection conn = null; try { @@ -130,50 +132,140 @@ public static Map getScientificNames(List taxIds) thro if (status == HttpURLConnection.HTTP_OK) { - DocumentBuilder builder = XmlBeansUtil.DOCUMENT_BUILDER_FACTORY.newDocumentBuilder(); - Document doc = builder.parse(conn.getInputStream()); + return parseScientificNames(conn.getInputStream()); + } + return new IntHashMap<>(); + } + catch (IOException | SAXException | ParserConfigurationException e) + { + throw new PxException("Error doing NCBI lookup for scientific names.", e); + } + finally + { + if(conn != null) conn.disconnect(); + } + } + + /** + * Returns the {@link DocumentBuilder} used to parse NCBI eSummary responses. NCBI's response + * begins with a {@code } declaration, so + * we use the {@code _ALLOWING_DOCTYPE} variant which permits the DOCTYPE but keeps every + * other XXE-mitigation in place. + */ + static DocumentBuilder getDocumentBuilder() throws ParserConfigurationException + { + return XmlBeansUtil.DOCUMENT_BUILDER_FACTORY_ALLOWING_DOCTYPE.newDocumentBuilder(); + } - NodeList nodes = doc.getElementsByTagName("DocSum"); - for(int i = 0; i < nodes.getLength(); i++) + // Parses an NCBI eSummary taxonomy XML response and returns a map of taxid -> scientific name. + private static Map parseScientificNames(InputStream in) + throws ParserConfigurationException, SAXException, IOException + { + Document doc = getDocumentBuilder().parse(in); + + Map sciNameMap = new IntHashMap<>(); + NodeList nodes = doc.getElementsByTagName("DocSum"); + for(int i = 0; i < nodes.getLength(); i++) + { + Element node = (Element)nodes.item(i); + Node idNode = node.getElementsByTagName("Id").item(0); + String taxidStr = null; + if(idNode != null) + { + Node taxidNode = idNode.getFirstChild(); + if (taxidNode instanceof CharacterData) { - Element node = (Element)nodes.item(i); - Node idNode = node.getElementsByTagName("Id").item(0); - String taxidStr = null; - if(idNode != null) - { - Node taxidNode = idNode.getFirstChild(); - if (taxidNode instanceof CharacterData) - { - taxidStr = ((CharacterData) taxidNode).getData(); - } - } + taxidStr = ((CharacterData) taxidNode).getData(); + } + } - NodeList children = node.getElementsByTagName("Item"); - for(int j = 0; j < children.getLength(); j++) + NodeList children = node.getElementsByTagName("Item"); + for(int j = 0; j < children.getLength(); j++) + { + Element child = (Element)children.item(j); + if(!StringUtils.isBlank(taxidStr) && ("ScientificName").equalsIgnoreCase(child.getAttribute("Name"))) + { + Node sciName = child.getFirstChild(); + if(sciName instanceof CharacterData) { - Element child = (Element)children.item(j); - if(!StringUtils.isBlank(taxidStr) && ("ScientificName").equalsIgnoreCase(child.getAttribute("Name"))) - { - Node sciName = child.getFirstChild(); - if(sciName instanceof CharacterData) - { - Integer taxid = Integer.parseInt(taxidStr); - sciNameMap.put(taxid, ((CharacterData) sciName).getData()); - break; - } - } + Integer taxid = Integer.parseInt(taxidStr); + sciNameMap.put(taxid, ((CharacterData) sciName).getData()); + break; } } } } - catch (IOException | SAXException | ParserConfigurationException e) + return sciNameMap; + } + + public static class TestCase extends Assert + { + // NCBI esummary taxonomy response captured from + // https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id=9606,10090,4932 + private static final String ESUMMARY_TAXONOMY_RESPONSE = + "\n" + + "\n" + + "\n" + + "\n" + + " 9606\n" + + " active\n" + + " species\n" + + " primates\n" + + " Homo sapiens\n" + + " human\n" + + " 9606\n" + + " 0\n" + + " \n" + + " \n" + + " \n" + + " 2024/09/10 00:00\n" + + "\n" + + "\n" + + "\n" + + " 10090\n" + + " active\n" + + " species\n" + + " rodents\n" + + " Mus musculus\n" + + " house mouse\n" + + " 10090\n" + + " 0\n" + + " \n" + + " \n" + + " \n" + + " 2025/06/16 00:00\n" + + "\n" + + "\n" + + "\n" + + " 4932\n" + + " active\n" + + " species\n" + + " budding yeasts & allies\n" + + " Saccharomyces cerevisiae\n" + + " brewer's yeast\n" + + " 4932\n" + + " 0\n" + + " \n" + + " \n" + + " \n" + + " 2025/08/11 00:00\n" + + "\n" + + "\n" + + "\n"; + + @Test + public void testParseScientificNames() throws Exception { - throw new PxException("Error doing NCBI lookup for scientific names.", e); + Map names = parseScientificNames(toStream(ESUMMARY_TAXONOMY_RESPONSE)); + assertEquals(3, names.size()); + assertEquals("Homo sapiens", names.get(9606)); + assertEquals("Mus musculus", names.get(10090)); + assertEquals("Saccharomyces cerevisiae", names.get(4932)); } - finally + + private static InputStream toStream(String xml) { - if(conn != null) conn.disconnect(); + return new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)); } - return sciNameMap; } }