From 1bafae4f11356467163ea98c984e381a1d9d655c Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Tue, 4 Nov 2025 15:07:20 -0500
Subject: [PATCH 1/5] feat(curation): Add functionality to filter results by
 curation

---
 NAMESPACE                        |  2 ++
 R/getSubnetworkFromIndra.R       |  7 ++--
 R/utils_getSubnetworkFromIndra.R | 57 ++++++++++++++++++++++++++++----
 man/getSubnetworkFromIndra.Rd    |  6 +++-
 4 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index e84ce42..94b06a0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -18,9 +18,11 @@ importFrom(RCy3,mapVisualProperty)
 importFrom(RCy3,setVisualStyle)
 importFrom(grDevices,colorRamp)
 importFrom(grDevices,rgb)
+importFrom(httr,GET)
 importFrom(httr,POST)
 importFrom(httr,add_headers)
 importFrom(httr,content)
+importFrom(httr,status_code)
 importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,toJSON)
 importFrom(r2r,hashmap)
diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R
index d0d2f39..55697b2 100644
--- a/R/getSubnetworkFromIndra.R
+++ b/R/getSubnetworkFromIndra.R
@@ -30,6 +30,8 @@
 #' @param force_include_other character vector of identifiers to include in the
 #' network, regardless if those ids are in the input data. Should be formatted
 #' as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911".
+#' @param filter_by_curation logical, whether to filter out statements that
+#' have been curated as incorrect in INDRA.  Default is FALSE.
 #'
 #' @return list of 2 data.frames, nodes and edges
 #'
@@ -53,11 +55,12 @@ getSubnetworkFromIndra <- function(input,
                                    correlation_cutoff = 0.3,
                                    sources_filter = NULL,
                                    logfc_cutoff = NULL,
-                                   force_include_other = NULL) {
+                                   force_include_other = NULL, 
+                                   filter_by_curation = FALSE) {
     input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, force_include_other)
     .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter, force_include_other)
     res <- .callIndraCogexApi(input$HgncId, force_include_other)
-    res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter)
+    res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter, filter_by_curation)
     edges <- .constructEdgesDataFrame(res, input, protein_level_data)
     edges <- .filterEdgesDataFrame(edges, paper_count_cutoff, correlation_cutoff)
     nodes <- .constructNodesDataFrame(input, edges)
diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R
index 4d3071c..2450fae 100644
--- a/R/utils_getSubnetworkFromIndra.R
+++ b/R/utils_getSubnetworkFromIndra.R
@@ -66,25 +66,51 @@
     return(res)
 }
 
+#' @importFrom httr GET status_code content
+.get_incorrect_curation_count <- function(stmt_hash) {
+    stmt_hash_char <- as.character(stmt_hash)
+    url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char)
+
+    tryCatch({
+        response <- GET(url)
+        if (status_code(response) == 200) {
+            curations <- fromJSON(content(response, "text", encoding = "UTF-8"))
+            if (length(curations) == 0) {
+                return(0)
+            }
+            incorrect_curations <- curations[curations$tag != "correct", ]
+            unique_incorrect <- length(unique(incorrect_curations$source_hash))
+            
+            return(unique_incorrect)
+        } else {
+            warning(paste("API request failed for hash", stmt_hash_char, 
+                          "with status code", status_code(response)))
+            return(0)
+        }
+    }, error = function(e) {
+        warning(paste("Error processing hash", stmt_hash_char, ":", e$message))
+        return(0)
+    })
+}
+
 #' Call INDRA Cogex API and return response
 #' @param res response from INDRA
 #' @param interaction_types interaction types to filter by
 #' @param evidence_count_cutoff number of evidence to filter on for each paper
 #' @param sources_filter list of sources to filter by. Default is NULL, i.e. no filter
+#' @param filter_by_curation logical, whether to filter out statements that
+#' have been curated as incorrect in INDRA.  Default is FALSE.
 #' @return filtered list of INDRA statements
 #' @importFrom jsonlite fromJSON
 #' @keywords internal
 #' @noRd
-.filterIndraResponse <- function(res, interaction_types, evidence_count_cutoff, sources_filter = NULL) {
+.filterIndraResponse <- function(res, interaction_types, evidence_count_cutoff, 
+                                 sources_filter = NULL, filter_by_curation = FALSE) {
     if (!is.null(interaction_types)) {
         res = Filter(
             function(statement) statement$data$stmt_type %in% interaction_types, 
             res)
     }
-    res = Filter(
-        function(statement) statement$data$evidence_count >= evidence_count_cutoff, 
-        res
-    )
     if (!is.null(sources_filter)) {
         res = Filter(
             function(statement) {
@@ -95,6 +121,18 @@
             res
         )
     }
+    if (filter_by_curation) {
+        for (i in seq_along(res)) {
+            stmt_hash <- res[[i]]$data$stmt_hash
+            incorrect_count <- .get_incorrect_curation_count(stmt_hash)
+            res[[i]]$data$evidence_count <- res[[i]]$data$evidence_count - incorrect_count
+            Sys.sleep(0.1)
+        }
+    }
+    res = Filter(
+        function(statement) statement$data$evidence_count >= evidence_count_cutoff, 
+        res
+    )
     return(res)
 }
 
@@ -264,6 +302,9 @@
         sourceCounts = vapply(keys(res), function(x) {
             query(res, x)$data$source_counts
         }, ""),
+        stmtHash = vapply(keys(res), function(x) {
+            as.character(query(res, x)$data$stmt_hash)
+        }, ""),
         stringsAsFactors = FALSE
     )
     # add correlation - maybe create a separate function
@@ -321,7 +362,8 @@
 #' @noRd
 .filterEdgesDataFrame <- function(edges, 
                                   paper_count_cutoff,
-                                  correlation_cutoff) {
+                                  correlation_cutoff,
+                                  filter_by_curation) {
     edges <- edges[which(edges$paperCount >= paper_count_cutoff), ]
     if ("correlation" %in% colnames(edges)) {
         edges <- edges[which(abs(edges$correlation) >= correlation_cutoff), ]
@@ -329,6 +371,9 @@
     if (nrow(edges) == 0) {
         stop("No edges remain after applying filters. Consider relaxing filters")
     }
+    if (filter_by_curation) {
+        # count number of evidences that are curated as incorrect and subtract number from evidence count
+    }
     return(edges)
 }
 
diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd
index ac8fa6f..8722e39 100644
--- a/man/getSubnetworkFromIndra.Rd
+++ b/man/getSubnetworkFromIndra.Rd
@@ -14,7 +14,8 @@ getSubnetworkFromIndra(
   correlation_cutoff = 0.3,
   sources_filter = NULL,
   logfc_cutoff = NULL,
-  force_include_other = NULL
+  force_include_other = NULL,
+  filter_by_curation = FALSE
 )
 }
 \arguments{
@@ -54,6 +55,9 @@ is NULL, i.e. no logFC filtering.}
 \item{force_include_other}{character vector of identifiers to include in the
 network, regardless if those ids are in the input data. Should be formatted
 as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911".}
+
+\item{filter_by_curation}{logical, whether to filter out statements that
+have been curated as incorrect in INDRA.  Default is FALSE.}
 }
 \value{
 list of 2 data.frames, nodes and edges

From 4ffaab4d32746b49c214bab76ae6fecafdc7dbd1 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Tue, 4 Nov 2025 16:55:32 -0500
Subject: [PATCH 2/5] update to enable user to enter api key

---
 R/getSubnetworkFromIndra.R       |  6 ++++--
 R/utils_getSubnetworkFromIndra.R | 15 ++++++---------
 man/getSubnetworkFromIndra.Rd    |  5 ++++-
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R
index 55697b2..ab3e232 100644
--- a/R/getSubnetworkFromIndra.R
+++ b/R/getSubnetworkFromIndra.R
@@ -32,6 +32,7 @@
 #' as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911".
 #' @param filter_by_curation logical, whether to filter out statements that
 #' have been curated as incorrect in INDRA.  Default is FALSE.
+#' @param api_key string of INDRA API key for accessing curated statements.
 #'
 #' @return list of 2 data.frames, nodes and edges
 #'
@@ -56,11 +57,12 @@ getSubnetworkFromIndra <- function(input,
                                    sources_filter = NULL,
                                    logfc_cutoff = NULL,
                                    force_include_other = NULL, 
-                                   filter_by_curation = FALSE) {
+                                   filter_by_curation = FALSE, 
+                                   api_key = "") {
     input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, force_include_other)
     .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter, force_include_other)
     res <- .callIndraCogexApi(input$HgncId, force_include_other)
-    res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter, filter_by_curation)
+    res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter, filter_by_curation, api_key)
     edges <- .constructEdgesDataFrame(res, input, protein_level_data)
     edges <- .filterEdgesDataFrame(edges, paper_count_cutoff, correlation_cutoff)
     nodes <- .constructNodesDataFrame(input, edges)
diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R
index 2450fae..ca285da 100644
--- a/R/utils_getSubnetworkFromIndra.R
+++ b/R/utils_getSubnetworkFromIndra.R
@@ -67,9 +67,9 @@
 }
 
 #' @importFrom httr GET status_code content
-.get_incorrect_curation_count <- function(stmt_hash) {
+.get_incorrect_curation_count <- function(stmt_hash, api_key) {
     stmt_hash_char <- as.character(stmt_hash)
-    url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char)
+    url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char, "?api_key=", api_key)
 
     tryCatch({
         response <- GET(url)
@@ -100,12 +100,13 @@
 #' @param sources_filter list of sources to filter by. Default is NULL, i.e. no filter
 #' @param filter_by_curation logical, whether to filter out statements that
 #' have been curated as incorrect in INDRA.  Default is FALSE.
+#' @param api_key character string giving the INDRA API key used to query curations.
 #' @return filtered list of INDRA statements
 #' @importFrom jsonlite fromJSON
 #' @keywords internal
 #' @noRd
 .filterIndraResponse <- function(res, interaction_types, evidence_count_cutoff, 
-                                 sources_filter = NULL, filter_by_curation = FALSE) {
+                                 sources_filter = NULL, filter_by_curation = FALSE, api_key = "") {
     if (!is.null(interaction_types)) {
         res = Filter(
             function(statement) statement$data$stmt_type %in% interaction_types, 
@@ -124,7 +125,7 @@
     if (filter_by_curation) {
         for (i in seq_along(res)) {
             stmt_hash <- res[[i]]$data$stmt_hash
-            incorrect_count <- .get_incorrect_curation_count(stmt_hash)
+            incorrect_count <- .get_incorrect_curation_count(stmt_hash, api_key)
             res[[i]]$data$evidence_count <- res[[i]]$data$evidence_count - incorrect_count
             Sys.sleep(0.1)
         }
@@ -362,8 +363,7 @@
 #' @noRd
 .filterEdgesDataFrame <- function(edges, 
                                   paper_count_cutoff,
-                                  correlation_cutoff,
-                                  filter_by_curation) {
+                                  correlation_cutoff) {
     edges <- edges[which(edges$paperCount >= paper_count_cutoff), ]
     if ("correlation" %in% colnames(edges)) {
         edges <- edges[which(abs(edges$correlation) >= correlation_cutoff), ]
@@ -371,9 +371,6 @@
     if (nrow(edges) == 0) {
         stop("No edges remain after applying filters. Consider relaxing filters")
     }
-    if (filter_by_curation) {
-        # count number of evidences that are curated as incorrect and subtract number from evidence count
-    }
     return(edges)
 }
 
diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd
index 8722e39..94d27f9 100644
--- a/man/getSubnetworkFromIndra.Rd
+++ b/man/getSubnetworkFromIndra.Rd
@@ -15,7 +15,8 @@ getSubnetworkFromIndra(
   sources_filter = NULL,
   logfc_cutoff = NULL,
   force_include_other = NULL,
-  filter_by_curation = FALSE
+  filter_by_curation = FALSE,
+  api_key = ""
 )
 }
 \arguments{
@@ -58,6 +59,8 @@ as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911".}
 
 \item{filter_by_curation}{logical, whether to filter out statements that
 have been curated as incorrect in INDRA.  Default is FALSE.}
+
+\item{api_key}{string of INDRA API key for accessing curated statements.}
 }
 \value{
 list of 2 data.frames, nodes and edges

From ba23e9e03cd0f2799f7476354428d854f5fcfc47 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Tue, 4 Nov 2025 17:17:00 -0500
Subject: [PATCH 3/5] remove stmt hash from edge table

---
 R/utils_getSubnetworkFromIndra.R | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R
index ca285da..d3251eb 100644
--- a/R/utils_getSubnetworkFromIndra.R
+++ b/R/utils_getSubnetworkFromIndra.R
@@ -303,9 +303,6 @@
         sourceCounts = vapply(keys(res), function(x) {
             query(res, x)$data$source_counts
         }, ""),
-        stmtHash = vapply(keys(res), function(x) {
-            as.character(query(res, x)$data$stmt_hash)
-        }, ""),
         stringsAsFactors = FALSE
     )
     # add correlation - maybe create a separate function

From 4271be891cf1e6f8d29deeca927f8c4029287d9c Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Tue, 4 Nov 2025 17:21:29 -0500
Subject: [PATCH 4/5] add todo for the future

---
 R/utils_getSubnetworkFromIndra.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R
index d3251eb..0ae9210 100644
--- a/R/utils_getSubnetworkFromIndra.R
+++ b/R/utils_getSubnetworkFromIndra.R
@@ -127,6 +127,7 @@
             stmt_hash <- res[[i]]$data$stmt_hash
             incorrect_count <- .get_incorrect_curation_count(stmt_hash, api_key)
             res[[i]]$data$evidence_count <- res[[i]]$data$evidence_count - incorrect_count
+            # TODO: also subtract the incorrect curations from source_counts if requested
             Sys.sleep(0.1)
         }
     }

From 0503630991a90d722571dea00f82215d436c3dc5 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Tue, 4 Nov 2025 17:58:48 -0500
Subject: [PATCH 5/5] add fromJSON dependency for function

---
 R/utils_getSubnetworkFromIndra.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R
index 0ae9210..911c863 100644
--- a/R/utils_getSubnetworkFromIndra.R
+++ b/R/utils_getSubnetworkFromIndra.R
@@ -67,6 +67,7 @@
 }
 
 #' @importFrom httr GET status_code content
+#' @importFrom jsonlite fromJSON
 .get_incorrect_curation_count <- function(stmt_hash, api_key) {
     stmt_hash_char <- as.character(stmt_hash)
     url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char, "?api_key=", api_key)