From 1bafae4f11356467163ea98c984e381a1d9d655c Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 4 Nov 2025 15:07:20 -0500 Subject: [PATCH 1/5] feat(curation): Add functionality to filter results by curation --- NAMESPACE | 2 ++ R/getSubnetworkFromIndra.R | 7 ++-- R/utils_getSubnetworkFromIndra.R | 57 ++++++++++++++++++++++++++++---- man/getSubnetworkFromIndra.Rd | 6 +++- 4 files changed, 63 insertions(+), 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e84ce42..94b06a0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,9 +18,11 @@ importFrom(RCy3,mapVisualProperty) importFrom(RCy3,setVisualStyle) importFrom(grDevices,colorRamp) importFrom(grDevices,rgb) +importFrom(httr,GET) importFrom(httr,POST) importFrom(httr,add_headers) importFrom(httr,content) +importFrom(httr,status_code) importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) importFrom(r2r,hashmap) diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R index d0d2f39..55697b2 100644 --- a/R/getSubnetworkFromIndra.R +++ b/R/getSubnetworkFromIndra.R @@ -30,6 +30,8 @@ #' @param force_include_other character vector of identifiers to include in the #' network, regardless if those ids are in the input data. Should be formatted #' as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911". +#' @param filter_by_curation logical, whether to filter out statements that +#' have been curated as incorrect in INDRA. Default is FALSE. #' #' @return list of 2 data.frames, nodes and edges #' @@ -53,11 +55,12 @@ getSubnetworkFromIndra <- function(input, correlation_cutoff = 0.3, sources_filter = NULL, logfc_cutoff = NULL, - force_include_other = NULL) { + force_include_other = NULL, + filter_by_curation = FALSE) { input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, force_include_other) .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter, force_include_other) res <- .callIndraCogexApi(input$HgncId, force_include_other) - res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter) + res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter, filter_by_curation) edges <- .constructEdgesDataFrame(res, input, protein_level_data) edges <- .filterEdgesDataFrame(edges, paper_count_cutoff, correlation_cutoff) nodes <- .constructNodesDataFrame(input, edges) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 4d3071c..2450fae 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -66,25 +66,51 @@ return(res) } +#' @importFrom httr GET status_code content +.get_incorrect_curation_count <- function(stmt_hash) { + stmt_hash_char <- as.character(stmt_hash) + url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char) + + tryCatch({ + response <- GET(url) + if (status_code(response) == 200) { + curations <- fromJSON(content(response, "text", encoding = "UTF-8")) + if (length(curations) == 0) { + return(0) + } + incorrect_curations <- curations[curations$tag != "correct", ] + unique_incorrect <- length(unique(incorrect_curations$source_hash)) + + return(unique_incorrect) + } else { + warning(paste("API request failed for hash", stmt_hash_char, + "with status code", status_code(response))) + return(0) + } + }, error = function(e) { + warning(paste("Error processing hash", stmt_hash_char, ":", e$message)) + return(0) + }) +} + #' Call INDRA Cogex API and return response #' @param res response from INDRA #' @param interaction_types interaction types to filter by #' @param evidence_count_cutoff number of evidence to filter on for each paper #' @param sources_filter list of sources to filter by. Default is NULL, i.e. no filter +#' @param filter_by_curation logical, whether to filter out statements that +#' have been curated as incorrect in INDRA. Default is FALSE. #' @return filtered list of INDRA statements #' @importFrom jsonlite fromJSON #' @keywords internal #' @noRd -.filterIndraResponse <- function(res, interaction_types, evidence_count_cutoff, sources_filter = NULL) { +.filterIndraResponse <- function(res, interaction_types, evidence_count_cutoff, + sources_filter = NULL, filter_by_curation = FALSE) { if (!is.null(interaction_types)) { res = Filter( function(statement) statement$data$stmt_type %in% interaction_types, res) } - res = Filter( - function(statement) statement$data$evidence_count >= evidence_count_cutoff, - res - ) if (!is.null(sources_filter)) { res = Filter( function(statement) { @@ -95,6 +121,18 @@ res ) } + if (filter_by_curation) { + for (i in seq_along(res)) { + stmt_hash <- res[[i]]$data$stmt_hash + incorrect_count <- .get_incorrect_curation_count(stmt_hash) + res[[i]]$data$evidence_count <- res[[i]]$data$evidence_count - incorrect_count + Sys.sleep(0.1) + } + } + res = Filter( + function(statement) statement$data$evidence_count >= evidence_count_cutoff, + res + ) return(res) } @@ -264,6 +302,9 @@ sourceCounts = vapply(keys(res), function(x) { query(res, x)$data$source_counts }, ""), + stmtHash = vapply(keys(res), function(x) { + as.character(query(res, x)$data$stmt_hash) + }, ""), stringsAsFactors = FALSE ) # add correlation - maybe create a separate function @@ -321,7 +362,8 @@ #' @noRd .filterEdgesDataFrame <- function(edges, paper_count_cutoff, - correlation_cutoff) { + correlation_cutoff, + filter_by_curation) { edges <- edges[which(edges$paperCount >= paper_count_cutoff), ] if ("correlation" %in% colnames(edges)) { edges <- edges[which(abs(edges$correlation) >= correlation_cutoff), ] @@ -329,6 +371,9 @@ if (nrow(edges) == 0) { stop("No edges remain after applying filters. Consider relaxing filters") } + if (filter_by_curation) { + # count number of evidences that are curated as incorrect and subtract number from evidence count + } return(edges) } diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd index ac8fa6f..8722e39 100644 --- a/man/getSubnetworkFromIndra.Rd +++ b/man/getSubnetworkFromIndra.Rd @@ -14,7 +14,8 @@ getSubnetworkFromIndra( correlation_cutoff = 0.3, sources_filter = NULL, logfc_cutoff = NULL, - force_include_other = NULL + force_include_other = NULL, + filter_by_curation = FALSE ) } \arguments{ @@ -54,6 +55,9 @@ is NULL, i.e. no logFC filtering.} \item{force_include_other}{character vector of identifiers to include in the network, regardless if those ids are in the input data. Should be formatted as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911".} + +\item{filter_by_curation}{logical, whether to filter out statements that +have been curated as incorrect in INDRA. Default is FALSE.} } \value{ list of 2 data.frames, nodes and edges From 4ffaab4d32746b49c214bab76ae6fecafdc7dbd1 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 4 Nov 2025 16:55:32 -0500 Subject: [PATCH 2/5] update to enable user to enter api key --- R/getSubnetworkFromIndra.R | 6 ++++-- R/utils_getSubnetworkFromIndra.R | 15 ++++++--------- man/getSubnetworkFromIndra.Rd | 5 ++++- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R index 55697b2..ab3e232 100644 --- a/R/getSubnetworkFromIndra.R +++ b/R/getSubnetworkFromIndra.R @@ -32,6 +32,7 @@ #' as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911". #' @param filter_by_curation logical, whether to filter out statements that #' have been curated as incorrect in INDRA. Default is FALSE. +#' @param api_key string of INDRA API key for accessing curated statements. #' #' @return list of 2 data.frames, nodes and edges #' @@ -56,11 +57,12 @@ getSubnetworkFromIndra <- function(input, sources_filter = NULL, logfc_cutoff = NULL, force_include_other = NULL, - filter_by_curation = FALSE) { + filter_by_curation = FALSE, + api_key = "") { input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, force_include_other) .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter, force_include_other) res <- .callIndraCogexApi(input$HgncId, force_include_other) - res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter, filter_by_curation) + res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter, filter_by_curation, api_key) edges <- .constructEdgesDataFrame(res, input, protein_level_data) edges <- .filterEdgesDataFrame(edges, paper_count_cutoff, correlation_cutoff) nodes <- .constructNodesDataFrame(input, edges) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 2450fae..ca285da 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -67,9 +67,9 @@ } #' @importFrom httr GET status_code content -.get_incorrect_curation_count <- function(stmt_hash) { +.get_incorrect_curation_count <- function(stmt_hash, api_key) { stmt_hash_char <- as.character(stmt_hash) - url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char) + url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char, "?api_key=", api_key) tryCatch({ response <- GET(url) @@ -100,12 +100,13 @@ #' @param sources_filter list of sources to filter by. Default is NULL, i.e. no filter #' @param filter_by_curation logical, whether to filter out statements that #' have been curated as incorrect in INDRA. Default is FALSE. +#' @param api_key string of INDRA API key for accessing curated statements. #' @return filtered list of INDRA statements #' @importFrom jsonlite fromJSON #' @keywords internal #' @noRd .filterIndraResponse <- function(res, interaction_types, evidence_count_cutoff, - sources_filter = NULL, filter_by_curation = FALSE) { + sources_filter = NULL, filter_by_curation = FALSE, api_key = "") { if (!is.null(interaction_types)) { res = Filter( function(statement) statement$data$stmt_type %in% interaction_types, @@ -124,7 +125,7 @@ if (filter_by_curation) { for (i in seq_along(res)) { stmt_hash <- res[[i]]$data$stmt_hash - incorrect_count <- .get_incorrect_curation_count(stmt_hash) + incorrect_count <- .get_incorrect_curation_count(stmt_hash, api_key) res[[i]]$data$evidence_count <- res[[i]]$data$evidence_count - incorrect_count Sys.sleep(0.1) } @@ -362,8 +363,7 @@ #' @noRd .filterEdgesDataFrame <- function(edges, paper_count_cutoff, - correlation_cutoff, - filter_by_curation) { + correlation_cutoff) { edges <- edges[which(edges$paperCount >= paper_count_cutoff), ] if ("correlation" %in% colnames(edges)) { edges <- edges[which(abs(edges$correlation) >= correlation_cutoff), ] @@ -371,9 +371,6 @@ if (nrow(edges) == 0) { stop("No edges remain after applying filters. Consider relaxing filters") } - if (filter_by_curation) { - # count number of evidences that are curated as incorrect and subtract number from evidence count - } return(edges) } diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd index 8722e39..94d27f9 100644 --- a/man/getSubnetworkFromIndra.Rd +++ b/man/getSubnetworkFromIndra.Rd @@ -15,7 +15,8 @@ getSubnetworkFromIndra( sources_filter = NULL, logfc_cutoff = NULL, force_include_other = NULL, - filter_by_curation = FALSE + filter_by_curation = FALSE, + api_key = "" ) } \arguments{ @@ -58,6 +59,8 @@ as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911".} \item{filter_by_curation}{logical, whether to filter out statements that have been curated as incorrect in INDRA. Default is FALSE.} + +\item{api_key}{string of INDRA API key for accessing curated statements.} } \value{ list of 2 data.frames, nodes and edges From ba23e9e03cd0f2799f7476354428d854f5fcfc47 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 4 Nov 2025 17:17:00 -0500 Subject: [PATCH 3/5] remove stmt hash from edge table --- R/utils_getSubnetworkFromIndra.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index ca285da..d3251eb 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -303,9 +303,6 @@ sourceCounts = vapply(keys(res), function(x) { query(res, x)$data$source_counts }, ""), - stmtHash = vapply(keys(res), function(x) { - as.character(query(res, x)$data$stmt_hash) - }, ""), stringsAsFactors = FALSE ) # add correlation - maybe create a separate function From 4271be891cf1e6f8d29deeca927f8c4029287d9c Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 4 Nov 2025 17:21:29 -0500 Subject: [PATCH 4/5] add todo for the future --- R/utils_getSubnetworkFromIndra.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index d3251eb..0ae9210 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -127,6 +127,7 @@ stmt_hash <- res[[i]]$data$stmt_hash incorrect_count <- .get_incorrect_curation_count(stmt_hash, api_key) res[[i]]$data$evidence_count <- res[[i]]$data$evidence_count - incorrect_count + # Todo: Also subtract source_counts accordingly if requested Sys.sleep(0.1) } } From 0503630991a90d722571dea00f82215d436c3dc5 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 4 Nov 2025 17:58:48 -0500 Subject: [PATCH 5/5] add fromJSON dependency for function --- R/utils_getSubnetworkFromIndra.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 0ae9210..911c863 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -67,6 +67,7 @@ } #' @importFrom httr GET status_code content +#' @importFrom jsonlite fromJSON .get_incorrect_curation_count <- function(stmt_hash, api_key) { stmt_hash_char <- as.character(stmt_hash) url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char, "?api_key=", api_key)