From 9c9e516899ebb50868014d7b5a8a6266172f9487 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Tue, 3 Mar 2026 23:25:17 -0500 Subject: [PATCH 1/7] Add assignEcByOrthologs.pl: transitive EC assignment via ortholog groups Uses group co-membership and shared InterPro family profiles to assign EC numbers to unannotated proteins by majority vote within profile clusters. Co-Authored-By: Claude Sonnet 4.6 --- Load/bin/assignEcByOrthologs.pl | 302 ++++++++++++++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100755 Load/bin/assignEcByOrthologs.pl diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl new file mode 100755 index 000000000..bbbc118de --- /dev/null +++ b/Load/bin/assignEcByOrthologs.pl @@ -0,0 +1,302 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use lib "$ENV{GUS_HOME}/lib/perl"; + +use Getopt::Long; +use DBI; +use GUS::Supported::GusConfig; + +my ($gusConfigFile, $output, $threshold, $group_id, $verbose); +$threshold = 0.60; + +&GetOptions( + "gusConfigFile=s" => \$gusConfigFile, + "output=s" => \$output, + "threshold=f" => \$threshold, + "group_id=s" => \$group_id, + "verbose" => \$verbose, +); + +die "Usage: assignEcByOrthologs.pl --output [--gusConfigFile ] [--threshold 0.60] [--group_id ] [--verbose]\n" + unless $output; + +$gusConfigFile ||= "$ENV{GUS_HOME}/config/gus.config"; + +# --- DB connection --- +my $gusconfig = GUS::Supported::GusConfig->new($gusConfigFile); +my $dbh = DBI->connect( + $gusconfig->getDbiDsn(), + $gusconfig->getDatabaseLogin(), + $gusconfig->getDatabasePassword(), + { RaiseError => 1, AutoCommit => 0 } +) or die DBI::errstr; + +# --- Load group members --- +print STDERR "Loading group members...\n" if $verbose; + +my %group_members; # group_id => [ aa_sequence_id, ... ] +my %protein_info; # aa_sequence_id => { source_id, protein_length } + +{ + my $sql = q{ + SELECT og.group_id, + og.aa_sequence_id, + s.source_id, + s.length AS protein_length + FROM apidb.orthologgroupaasequence og + JOIN dots.aasequence s ON s.aa_sequence_id = og.aa_sequence_id + }; + $sql .= " WHERE og.group_id = ?" if $group_id; + + my $sth = $dbh->prepare($sql); + $group_id ? $sth->execute($group_id) : $sth->execute(); + + while (my ($gid, $aaid, $src, $len) = $sth->fetchrow_array()) { + push @{ $group_members{$gid} }, $aaid; + $protein_info{$aaid} = { source_id => $src, protein_length => $len // 0 }; + } + $sth->finish(); +} +print STDERR "Loaded ", scalar(keys %group_members), " groups, ", + scalar(keys %protein_info), " proteins\n" if $verbose; + +# --- Load EC numbers (exclude OrthoMCLDerived) --- +print STDERR "Loading EC annotations...\n" if $verbose; + +my %protein_ec; # aa_sequence_id => [ ec_number, ... ] + +{ + my $sql = q{ + SELECT ae.aa_sequence_id, + ec.ec_number + FROM dots.aasequenceenzymeclass ae + JOIN sres.enzymeclass ec ON ec.enzyme_class_id = ae.enzyme_class_id + WHERE (ae.evidence_code IS NULL OR ae.evidence_code != 'OrthoMCLDerived') + }; + + if ($group_id) { + $sql = q{ + SELECT ae.aa_sequence_id, + ec.ec_number + FROM dots.aasequenceenzymeclass ae + JOIN sres.enzymeclass ec ON ec.enzyme_class_id = ae.enzyme_class_id + JOIN apidb.orthologgroupaasequence og ON og.aa_sequence_id = ae.aa_sequence_id + WHERE (ae.evidence_code IS NULL OR ae.evidence_code != 'OrthoMCLDerived') + AND og.group_id = ? + }; + } + + my $sth = $dbh->prepare($sql); + $group_id ? $sth->execute($group_id) : $sth->execute(); + + while (my ($aaid, $ec) = $sth->fetchrow_array()) { + push @{ $protein_ec{$aaid} }, $ec if exists $protein_info{$aaid}; + } + $sth->finish(); +} +my $ec_count = scalar grep { @{ $protein_ec{$_} } } keys %protein_ec; +print STDERR "Loaded EC annotations for $ec_count proteins\n" if $verbose; + +# --- Load InterPro family IDs --- +print STDERR "Loading InterPro family profiles...\n" if $verbose; + +my %protein_profile; # aa_sequence_id => "IPR...|IPR..." (sorted, pipe-joined) + +{ + my %ipr_sets; # aa_sequence_id => { ipr_id => 1 } + + my $sql = q{ + SELECT ir.protein_source_id, + ir.interpro_family_id + FROM apidb.interproresults ir + WHERE ir.interpro_family_id IS NOT NULL + AND ir.interpro_family_id != '-' + }; + + if ($group_id) { + $sql = q{ + SELECT ir.protein_source_id, + ir.interpro_family_id + FROM apidb.interproresults ir + JOIN dots.aasequence s ON s.source_id = ir.protein_source_id + JOIN apidb.orthologgroupaasequence og ON og.aa_sequence_id = s.aa_sequence_id + WHERE ir.interpro_family_id IS NOT NULL + AND ir.interpro_family_id != '-' + AND og.group_id = ? + }; + } + + my $sth = $dbh->prepare($sql); + $group_id ? $sth->execute($group_id) : $sth->execute(); + + my %src_to_aaid; + for my $aaid (keys %protein_info) { + $src_to_aaid{ $protein_info{$aaid}{source_id} } = $aaid; + } + + while (my ($src, $ipr) = $sth->fetchrow_array()) { + my $aaid = $src_to_aaid{$src}; + next unless defined $aaid; + $ipr_sets{$aaid}{$ipr} = 1; + } + $sth->finish(); + + for my $aaid (keys %ipr_sets) { + $protein_profile{$aaid} = join('|', sort keys %{ $ipr_sets{$aaid} }); + } +} +print STDERR "Loaded InterPro profiles for ", scalar(keys %protein_profile), " proteins\n" + if $verbose; + +$dbh->disconnect(); + +# --- Open output --- +open(my $OUT, '>', $output) or die "Cannot open output '$output': $!\n"; + +print $OUT join("\t", qw( + group_id + aa_sequence_id + source_id + protein_length + assigned_ec_number + is_novel + confidence_score + n_supporting + n_annotated_in_cluster + cluster_size + cluster_profile +)), "\n"; + +# --- Process each group --- +my $groups_processed = 0; +my $rows_written = 0; + +for my $gid (sort keys %group_members) { + my @members = @{ $group_members{$gid} }; + + # Cluster by profile key + my %clusters; # profile_key => [ aa_sequence_id, ... ] + for my $aaid (@members) { + my $key = $protein_profile{$aaid} // ''; + push @{ $clusters{$key} }, $aaid; + } + + for my $profile_key (sort keys %clusters) { + my @cluster = @{ $clusters{$profile_key} }; + + # Collect all ECs from annotated proteins in this cluster + my %protein_ecs_in_cluster; # aaid => [ec, ...] + my @all_cluster_ecs; + + for my $aaid (@cluster) { + if ($protein_ec{$aaid} && @{ $protein_ec{$aaid} }) { + $protein_ecs_in_cluster{$aaid} = $protein_ec{$aaid}; + push @all_cluster_ecs, @{ $protein_ec{$aaid} }; + } + } + + my $n_annotated = scalar keys %protein_ecs_in_cluster; + next if $n_annotated == 0; # nothing to propagate from + + # Count support per EC (post-normalization) + my %ec_support; + for my $aaid (keys %protein_ecs_in_cluster) { + my @p_ecs = normalize_ecs(@{ $protein_ecs_in_cluster{$aaid} }); + my %seen; + for my $ec (@p_ecs) { + next if $seen{$ec}++; + $ec_support{$ec}++; + } + } + + # Majority vote + my @passing_ecs; + for my $ec (sort keys %ec_support) { + my $support = $ec_support{$ec}; + my $score = $support / $n_annotated; + if ($score >= $threshold) { + push @passing_ecs, { ec => $ec, support => $support, score => $score }; + } + } + next unless @passing_ecs; + + my $cluster_size = scalar @cluster; + my $display_profile = $profile_key eq '' ? 'none' : $profile_key; + + for my $aaid (@cluster) { + my $info = $protein_info{$aaid}; + my %existing_ecs = map { $_ => 1 } @{ $protein_ec{$aaid} // [] }; + + for my $pass (@passing_ecs) { + my $ec = $pass->{ec}; + my $support = $pass->{support}; + my $score = $pass->{score}; + my $is_novel = $existing_ecs{$ec} ? 0 : 1; + + printf $OUT "%s\t%s\t%s\t%d\t%s\t%d\t%.4f\t%d\t%d\t%d\t%s\n", + $gid, + $aaid, + $info->{source_id}, + $info->{protein_length}, + $ec, + $is_novel, + $score, + $support, + $n_annotated, + $cluster_size, + $display_profile; + + $rows_written++; + } + } + } + + $groups_processed++; + if ($verbose && $groups_processed % 10000 == 0) { + print STDERR "Processed $groups_processed groups, $rows_written rows written\n"; + } +} + +close($OUT); + +print STDERR "Done. Processed $groups_processed groups, wrote $rows_written rows to $output\n"; + +# =========================================================================== +# Helpers +# =========================================================================== + +sub normalize_ecs { + my @ecs = @_; + my @result; + for my $ec (@ecs) { + my $subsumed = 0; + for my $other (@ecs) { + next if $other eq $ec; + if (is_more_specific($other, $ec)) { + $subsumed = 1; + last; + } + } + push @result, $ec unless $subsumed; + } + return @result; +} + +sub is_more_specific { + my ($specific, $general) = @_; + my @s = split /\./, $specific; + my @g = split /\./, $general; + return 0 unless @g == 4 && @s == 4; + for my $i (0 .. 3) { + if ($g[$i] eq '-') { + # general is a wildcard here; specific is more specific only if it has a defined value + return ($s[$i] ne '-') ? 1 : 0; + } + return 0 if $s[$i] ne $g[$i]; # mismatch at a defined position + } + # All 4 positions matched exactly — same specificity, not strictly more specific + return 0; +} From 4cccb4ba2ae938e3f45a99ccdd5e391176bcef12 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 4 Mar 2026 00:48:19 -0500 Subject: [PATCH 2/7] Add verbose debug output for skipped EC assignments When --verbose is on, print the reason each EC in a group was not transitively assigned: either subsumed by a more specific EC during hierarchy normalization, or below the majority-vote threshold. Co-Authored-By: Claude Sonnet 4.6 --- Load/bin/assignEcByOrthologs.pl | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl index bbbc118de..5c64c6b04 100755 --- a/Load/bin/assignEcByOrthologs.pl +++ b/Load/bin/assignEcByOrthologs.pl @@ -201,6 +201,9 @@ my $n_annotated = scalar keys %protein_ecs_in_cluster; next if $n_annotated == 0; # nothing to propagate from + my $cluster_size = scalar @cluster; + my $display_profile = $profile_key eq '' ? 'none' : $profile_key; + # Count support per EC (post-normalization) my %ec_support; for my $aaid (keys %protein_ecs_in_cluster) { @@ -212,6 +215,16 @@ } } + if ($verbose) { + # Report any raw ECs that were dropped by hierarchy normalization + my %raw_unique = map { $_ => 1 } @all_cluster_ecs; + for my $ec (sort keys %raw_unique) { + unless (exists $ec_support{$ec}) { + print STDERR " SKIP $gid profile=$display_profile EC=$ec: subsumed by more specific EC in cluster\n"; + } + } + } + # Majority vote my @passing_ecs; for my $ec (sort keys %ec_support) { @@ -219,13 +232,13 @@ my $score = $support / $n_annotated; if ($score >= $threshold) { push @passing_ecs, { ec => $ec, support => $support, score => $score }; + } elsif ($verbose) { + printf STDERR " SKIP $gid profile=$display_profile EC=$ec: score too low (%d/%d = %.2f < %.2f)\n", + $support, $n_annotated, $score, $threshold; } } next unless @passing_ecs; - my $cluster_size = scalar @cluster; - my $display_profile = $profile_key eq '' ? 'none' : $profile_key; - for my $aaid (@cluster) { my $info = $protein_info{$aaid}; my %existing_ecs = map { $_ => 1 } @{ $protein_ec{$aaid} // [] }; From c24246cbc8111500112b407ddc009640c6de60fe Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 4 Mar 2026 12:18:34 -0500 Subject: [PATCH 3/7] add some extra logging per source_id for debug purposes. remove now unused scripots --- Load/bin/assignEcByOrthologs.pl | 86 +- Load/bin/newEcPrediction.pl | 202 ----- Load/bin/orthomclEcPrediction | 1478 ------------------------------- 3 files changed, 84 insertions(+), 1682 deletions(-) delete mode 100644 Load/bin/newEcPrediction.pl delete mode 100644 Load/bin/orthomclEcPrediction diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl index 5c64c6b04..77dc1313b 100755 --- a/Load/bin/assignEcByOrthologs.pl +++ b/Load/bin/assignEcByOrthologs.pl @@ -9,7 +9,7 @@ use DBI; use GUS::Supported::GusConfig; -my ($gusConfigFile, $output, $threshold, $group_id, $verbose); +my ($gusConfigFile, $output, $threshold, $group_id, $source_id, $verbose); $threshold = 0.60; &GetOptions( @@ -17,10 +17,11 @@ "output=s" => \$output, "threshold=f" => \$threshold, "group_id=s" => \$group_id, + "source_id=s" => \$source_id, "verbose" => \$verbose, ); -die "Usage: assignEcByOrthologs.pl --output [--gusConfigFile ] [--threshold 0.60] [--group_id ] [--verbose]\n" +die "Usage: assignEcByOrthologs.pl --output [--gusConfigFile ] [--threshold 0.60] [--group_id ] [--source_id ] [--verbose]\n" unless $output; $gusConfigFile ||= "$ENV{GUS_HOME}/config/gus.config"; @@ -153,6 +154,13 @@ $dbh->disconnect(); +if ($source_id) { + my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } keys %protein_info; + die "ERROR: source_id '$source_id' not found in loaded data. Check the ID or --group_id filter.\n" + unless defined $focus_aaid; + print STDERR "Focusing on source_id=$source_id (aa_sequence_id=$focus_aaid)\n" if $verbose; +} + # --- Open output --- open(my $OUT, '>', $output) or die "Cannot open output '$output': $!\n"; @@ -184,6 +192,25 @@ push @{ $clusters{$key} }, $aaid; } + # Report which cluster the focus protein landed in + if ($verbose && $source_id) { + my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } @members; + if (defined $focus_aaid) { + my $profile = $protein_profile{$focus_aaid} // ''; + my $display = $profile eq '' ? 'none (no InterPro family hits)' : $profile; + print STDERR "\n PROTEIN $source_id (aa_sequence_id=$focus_aaid)\n"; + print STDERR " group=$gid\n"; + print STDERR " InterPro profile: $display\n"; + my $own_ecs = @{ $protein_ec{$focus_aaid} // [] } + ? join(', ', @{ $protein_ec{$focus_aaid} }) + : '(none)'; + print STDERR " existing ECs on this protein: $own_ecs\n"; + my $cluster_key = $protein_profile{$focus_aaid} // ''; + my $cluster_size = scalar @{ $clusters{$cluster_key} }; + print STDERR " cluster_size (same profile): $cluster_size\n\n"; + } + } + for my $profile_key (sort keys %clusters) { my @cluster = @{ $clusters{$profile_key} }; @@ -199,6 +226,14 @@ } my $n_annotated = scalar keys %protein_ecs_in_cluster; + + if ($n_annotated == 0 && $verbose && $source_id) { + my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } @cluster; + if (defined $focus_aaid) { + my $display = $profile_key eq '' ? 'none' : $profile_key; + print STDERR " PROTEIN $source_id: cluster (profile=$display) has no annotated proteins — nothing to propagate\n\n"; + } + } next if $n_annotated == 0; # nothing to propagate from my $cluster_size = scalar @cluster; @@ -237,6 +272,53 @@ $support, $n_annotated, $score, $threshold; } } + # If we are focusing on a specific protein, print its cluster context + # regardless of whether any ECs pass threshold + if ($verbose && $source_id) { + my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } @cluster; + if (defined $focus_aaid) { + print STDERR "\n PROTEIN $source_id (aa_sequence_id=$focus_aaid)\n"; + print STDERR " group=$gid cluster_size=$cluster_size n_annotated=$n_annotated\n"; + print STDERR " profile=$display_profile\n"; + my $own_ecs = @{ $protein_ec{$focus_aaid} // [] } + ? join(', ', @{ $protein_ec{$focus_aaid} }) + : '(none)'; + print STDERR " existing ECs on this protein: $own_ecs\n"; + if ($n_annotated > 0) { + print STDERR " cluster members with ECs:\n"; + for my $other (sort keys %protein_ecs_in_cluster) { + my $osrc = $protein_info{$other}{source_id}; + my @norm = normalize_ecs(@{ $protein_ecs_in_cluster{$other} }); + printf STDERR " %s raw=[%s] normalized=[%s]\n", + $osrc, + join(', ', @{ $protein_ecs_in_cluster{$other} }), + join(', ', @norm); + } + print STDERR " EC vote results:\n"; + for my $ec (sort keys %ec_support) { + my $score = $ec_support{$ec} / $n_annotated; + my $verdict = $score >= $threshold ? 'PASS' : 'fail'; + printf STDERR " EC=%s %d/%d = %.2f %s\n", + $ec, $ec_support{$ec}, $n_annotated, $score, $verdict; + } + } else { + print STDERR " no annotated proteins in this cluster — nothing to propagate\n"; + } + if (@passing_ecs) { + print STDERR " ECs to be assigned:\n"; + my %existing_ecs = map { $_ => 1 } @{ $protein_ec{$focus_aaid} // [] }; + for my $pass (@passing_ecs) { + my $novel = $existing_ecs{$pass->{ec}} ? 'already annotated' : 'NOVEL'; + printf STDERR " EC=%s score=%.2f (%d/%d) %s\n", + $pass->{ec}, $pass->{score}, $pass->{support}, $n_annotated, $novel; + } + } else { + print STDERR " no ECs passed threshold — nothing assigned\n"; + } + print STDERR "\n"; + } + } + next unless @passing_ecs; for my $aaid (@cluster) { diff --git a/Load/bin/newEcPrediction.pl b/Load/bin/newEcPrediction.pl deleted file mode 100644 index 71aebd913..000000000 --- a/Load/bin/newEcPrediction.pl +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/perl - -use warnings; -use strict; -use Data::Dumper; -use lib "$ENV{GUS_HOME}/lib/perl"; -use GUS::Supported::GusConfig; -use GUS::ObjRelP::DbiDatabase; -use CBIL::Util::PropertySet; -use DBI; - -my $outputFile = $ARGV[0]; -my $gusConfigFile = $ARGV[1]; - -$gusConfigFile = "$ENV{GUS_HOME}/config/gus.config" unless ($gusConfigFile); -my $gusconfig = GUS::Supported::GusConfig->new($gusConfigFile); - -my $verbose; -my $db = GUS::ObjRelP::DbiDatabase->new($gusconfig->getDbiDsn(), - $gusconfig->getDatabaseLogin(), - $gusconfig->getDatabasePassword(), - $verbose,0,1, - $gusconfig->getCoreSchemaName() - ); - -my $dbh = $db->getQueryHandle(); - -open (OUT, ">$outputFile") || die "can not open $outputFile to write\n"; - -# Get all groups -my @groups = @{ &getGroups($dbh) }; - -my $counter = 0; - -# For every group -foreach my $groupId (@groups) { - - $counter += 1; - - if ($counter % 1000 == 0) { - print "$counter groups processed...\n"; - } - - # Get all proteins from that group with an EC assignment - my %proteinToEc = %{ &getProteinToEC($dbh,$groupId) }; - - # Get all proteins that have a significant hit to the group centroid. Note, this is stored as the absolute log of the evalue. - my %proteinToEValue = %{ &getProteinToEValue($dbh,$groupId) }; - - my %ecBestEvalue; - my %processedProteins; - - # For every EC assignment in this group, record the best blast evalue to the centroid - # For each protein - foreach my $protein (keys %proteinToEc) { - # This protein gets a score of 4, as it is assigned this EC - print OUT "$protein\t$proteinToEc{$protein}\t4\n"; - # If we already have an protein with this EC, and it has a significant blast to the centroid - if ($ecBestEvalue{$proteinToEc{$protein}} && $proteinToEValue{$protein}) { - if ($ecBestEvalue{$proteinToEc{$protein}} < $proteinToEValue{$protein}) { - $ecBestEvalue{$proteinToEc{$protein}} = $proteinToEValue{$protein}; - } - } - # Another protein assigned this EC, but doesn't have a significant blast hit with the centroid - elsif ($ecBestEvalue{$proteinToEc{$protein}} && !$proteinToEValue{$protein}) { - next; - } - # First protein with this EC, and it has a significant blast value to the centroid - elsif (!$ecBestEvalue{$proteinToEc{$protein}} && $proteinToEValue{$protein}) { - $ecBestEvalue{$proteinToEc{$protein}} = $proteinToEValue{$protein}; - } - # First protein with this EC, and it does not have a significant blast value to the centroid - else { - $ecBestEvalue{$proteinToEc{$protein}} = 5; - } - # Keeps track of which proteins we have already scored (4 as they have this EC assigned) - $processedProteins{$protein} = 1; - } - - # For every protein in the group that has a significant blast hit to the centroid, we assign a score (if it had an EC assignment, it already got a 4) - foreach my $protein (keys %proteinToEValue) { - # We have already assigned this a 4 - next if ($processedProteins{$protein}); - # It already cannot get a score of 1 - next if (!$proteinToEValue{$protein}); - next if ($proteinToEValue{$protein} < 10); - # For every EC - foreach my $ec (keys %ecBestEvalue) { - # If the protein to centroid blast was > 100 - if ($proteinToEValue{$protein} > 100) { - # And the best blast to the centroid for all proteins with this EC was > 100 - if ($ecBestEvalue{$ec} > 100) { - # This protein EC combo scores a 3 - print OUT "$protein\t$ec\t3\n"; - } - # If the the best blast to the centroid for all proteins with this EC was > 50 - elsif ($ecBestEvalue{$ec} > 50) { - # This protein EC combo scores a 2 - print OUT "$protein\t$ec\t2\n"; - } - # If the the best blast to the centroid for all proteins with this EC was > 10 - elsif ($ecBestEvalue{$ec} >= 10) { - # This protein EC combo scores a 1 - print OUT "$protein\t$ec\t1\n"; - } - } - # Same process for this but this protein EC combo can only score a max of a 2 - elsif ($proteinToEValue{$protein} > 50) { - if ($ecBestEvalue{$ec} > 50) { - print OUT "$protein\t$ec\t2\n"; - } - elsif ($ecBestEvalue{$ec} >= 10) { - print OUT "$protein\t$ec\t1\n"; - } - } - # Same process for this but this protein EC combo can only score a max of a 1 - elsif ($proteinToEValue{$protein} >= 10) { - if ($ecBestEvalue{$ec} >= 10) { - print OUT "$protein\t$ec\t1\n"; - } - } - } - } -} - -close OUT; - -sub getGroups { - my ($dbh) = @_; - my $sql = "SELECT DISTINCT(group_name) FROM webready.ProteinSequence_pgroup"; - my $sh = $dbh->prepare($sql); - $sh->execute(); - while(my ($groupId) = $sh->fetchrow_array()) { - push(@groups,$groupId); - } - $sh->finish(); - return \@groups; -} - -sub getProteinToEValue { - my ($dbh,$groupId) = @_; - my $sql = "SELECT qseq,evalue FROM apidb.orthogroupblastvalue - WHERE group_id = '$groupId'"; - my $sh = $dbh->prepare($sql); - $sh->execute(); - - my %proteinToEValue; - while(my ($protein,$evalue) = $sh->fetchrow_array()) { - if ($evalue eq '0.0') { - $evalue = 1.0e-300; - } - $proteinToEValue{$protein} = abs_log10($evalue); - } - $sh->finish(); - - return \%proteinToEValue; -} - -sub getProteinToDomain { - my ($dbh,$groupId) = @_; - my $sql = "SELECT full_id,accession FROM webready.ProteinDomainAssignment - WHERE group_name = '$groupId'"; - my $sh = $dbh->prepare($sql); - $sh->execute(); - - my %proteinToDomain; - while(my ($protein,$domain) = $sh->fetchrow_array()) { - $proteinToDomain{$protein} = $domain; - } - $sh->finish(); - - return \%proteinToDomain; -} - -sub getProteinToEC { - my ($dbh,$groupId) = @_; - my $sql = "SELECT pda.full_id,ec.ec_number - FROM SRes.EnzymeClass ec, DoTS.AASequenceEnzymeClass aaec, webready.ProteinDomainAssignment pda - WHERE ec.enzyme_class_id = aaec.enzyme_class_id - AND aaec.aa_sequence_id = pda.aa_sequence_id - AND pda.group_name = '$groupId'"; - my $sh = $dbh->prepare($sql); - $sh->execute(); - - my %proteinToEC; - while(my ($protein,$ec) = $sh->fetchrow_array()) { - $proteinToEC{$protein} = $ec; - } - $sh->finish(); - - return \%proteinToEC; -} - -sub abs_log10 { - my ($num) = @_; - - # Handle zero or negative input - return undef if $num <= 0; - - # Take the log base 10 and return absolute value, rounded - return int(abs(log($num) / log(10)) + 0.5); # round to nearest int -} diff --git a/Load/bin/orthomclEcPrediction b/Load/bin/orthomclEcPrediction deleted file mode 100644 index b8971573e..000000000 --- a/Load/bin/orthomclEcPrediction +++ /dev/null @@ -1,1478 +0,0 @@ -#!/usr/bin/perl - -use warnings; -use strict; -use Data::Dumper; -use lib "$ENV{GUS_HOME}/lib/perl"; -use CBIL::Util::PropertySet; -use DBI; - -my $outputDir = $ARGV[0]; -my $outputFile = $ARGV[1]; -my $gusConfigFile = $ARGV[2]; - -my $minNumProteinsWithEc = 1; # if not testing, then set to 1 -my $maxNumProteinsWithEc = 0; # if not testing, then set to 0 -my $minNumGeneraForViableEc = 1; -my $minNumProteinsForViableEc = 1; -my $excludeOld = 1; -my $createStatsFile = 0; # if not testing, then set to 0 -my $createProteinFile = 0; # if not testing, then set to 0 -my $createLogFile = 0; # if not testing, then set to 0 and the log will go to STDERR -# If this is ever run, it will fail -my $test = 0; # if not testing, then set to 0 -my $fractionOfGroups = 1; # if not testing, then set to 1 - -my ($testFraction,$totalEcsTested,$noEcMatch,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh) = &setUpTest($outputDir) if ($test); - -my ($logFh,$netGroupsRef,$dbh,$groups) = &setUpPrediction($outputDir,$fractionOfGroups,$minNumProteinsWithEc,$maxNumProteinsWithEc,$createLogFile, $gusConfigFile); - -if (!$test) { - open(my $outFh, ">", $outputFile) || die "Cannot open file '$outputFile' for writing"; - close $outFh; -} - -foreach my $group (keys %{$groups}) { - next if (&nextGroup($fractionOfGroups,$netGroupsRef,$logFh)); - - my ($statsFh,$proteinFile) = &makeGroupDirAndFiles($outputDir,$group,$createStatsFile,$createProteinFile); - my ($proteinIds,$numTotalProteins,$numTotalGenera) = &getProteinInfo($group,$excludeOld,$dbh,$proteinFile); - - my ($trainingIds,$testIds); - ($trainingIds,$testIds) = &getTrainingAndTestIds($proteinIds,$testFraction) if ($test); - next if ($test && ($trainingIds eq "" || $testIds eq "")); - - my $viableEcNumbers = &getViableEcNumbers($proteinIds,$test,$trainingIds,$minNumGeneraForViableEc,$minNumProteinsForViableEc,$statsFh); - - my ($domainStatsPerEc,$domainPerProtein) = &ecDomainStats($proteinIds,$test,$trainingIds,$viableEcNumbers,$statsFh); - my $lengthStatsPerEc = &ecLengthStats($proteinIds,$test,$trainingIds,$viableEcNumbers,$statsFh); - my ($blastStatsPerEc,$blastStatsPerProtein,$proteinToEc) = &ecBlastStats($dbh,$group); - - close $statsFh if ($statsFh); - - my $scores = &getProteinScores($proteinIds,$numTotalProteins,$numTotalGenera,$test,$testIds,$viableEcNumbers,$domainStatsPerEc,$domainPerProtein,$lengthStatsPerEc,$blastStatsPerEc,$blastStatsPerProtein,$proteinToEc,$outputFile,$dbh); - - &testScores($scores,$proteinIds,$testIds,$totalEcsTested,$noEcMatch,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh,$logFh) if ($test); -} - -&printTest($totalEcsTested,$noEcMatch,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh) if ($test); - -$dbh->disconnect(); - -exit; - -################################ SUBROUTINES ######################################## - -sub setUpPrediction { - my ($outputDirectory,$fractionOfGroups,$minNumProteinsWithEc,$maxNumProteinsWithEc,$createLogFile, $gusConfigFile) = @_; - my $dbh = &getDbHandle($gusConfigFile); - my $groups = &getGroupsFromDatabase($minNumProteinsWithEc,$maxNumProteinsWithEc,$dbh); - - my $logFh; - my $numGroups = keys %{$groups}; - my $netGroups = sprintf("%.0f",$fractionOfGroups * $numGroups); - if ($createLogFile) { - my $logFile = "$outputDirectory/log.txt"; - open($logFh,">",$logFile) || die "Cannot open file '$logFile' for writing"; - } - print {$logFh ? $logFh : *STDERR } "Start time: ".localtime()."\n"; - print {$logFh ? $logFh : *STDERR } "Groups must have between $minNumProteinsWithEc and $maxNumProteinsWithEc proteins with EC numbers\n"; - print {$logFh ? $logFh : *STDERR } "Total groups: $numGroups Fraction: $fractionOfGroups Net number of groups: $netGroups\n"; - return ($logFh,\$netGroups,$dbh,$groups); -} - -sub nextGroup { - my ($fractionOfGroups,$netGroupsRef,$logFh) = @_; - return 1 if (rand(1) >= $fractionOfGroups); - print {$logFh ? $logFh : *STDERR } "Approx number groups remaining: ${$netGroupsRef}\n"; - ${$netGroupsRef}--; - return 0; -} - -sub setUpTest { - my ($outputDirectory) = @_; - my $testFraction = 0.3; - my $totalEcsTested=0; - my $noEcMatch=0; - my $numProteinsExtraEc=0; - my $numExtraEc=0; - my $numTotalProteins=0; - my %testExact; - my %testLessPrecise; - my %testMorePrecise; - my $testFile = "$outputDirectory/test.txt"; - open(my $testFh,">",$testFile) || die "Cannot open $testFile for writing\n"; - return (\$testFraction,\$totalEcsTested,\$noEcMatch,\%testExact,\%testLessPrecise,\%testMorePrecise,\$numProteinsExtraEc,\$numExtraEc,\$numTotalProteins,$testFh); -} - - -sub getTrainingAndTestIds { - my ($proteinIds,$testFraction) = @_; - my ($trainingIds,$testIds); - - my $numProteinsWithEc = &numProteinsWithEc($proteinIds); - return ("","") if $numProteinsWithEc < 2; - my $uniqueEcNumbers = &getUniqueEcNumbersFromProteins($proteinIds,0,0); - foreach my $uniqueEc (keys %{$uniqueEcNumbers}) { - return ("","") if ($uniqueEcNumbers->{$uniqueEc} < 2); - } - my $numProteinsTest = sprintf("%.0f",${$testFraction} * $numProteinsWithEc); - $numProteinsTest = 1 if ($numProteinsTest < 1); - $numProteinsTest = $numProteinsWithEc - 1 if ($numProteinsTest == $numProteinsWithEc); - my $testIdPositions = &getTestIdPositions($numProteinsTest,$numProteinsWithEc); - - my $proteinCounter = 1; - foreach my $id (keys %{$proteinIds}) { - next if (scalar @{$proteinIds->{$id}->{ec}} == 0); - if (exists $testIdPositions->{$proteinCounter}) { - $testIds->{$id} = 1; - } else { - $trainingIds->{$id} = 1; - } - $proteinCounter++; - } - return("","") if (&ecsNotInTrainingAndTest($proteinIds,$trainingIds,$testIds,$uniqueEcNumbers)); - return ($trainingIds,$testIds); -} - -sub ecsNotInTrainingAndTest { - my ($proteinIds,$trainingIds,$testIds,$uniqueEcNumbers) = @_; - foreach my $uniqueEc (keys %{$uniqueEcNumbers}) { - my $found=0; - foreach my $id (keys %{$trainingIds}) { - foreach my $idEc (@{$proteinIds->{$id}->{ec}}) { - if ($idEc eq $uniqueEc) { - $found=1; - last; - } - } - last if ($found == 1); - } - return 1 if ($found == 0); - foreach my $id (keys %{$testIds}) { - foreach my $idEc (@{$proteinIds->{$id}->{ec}}) { - if ($idEc eq $uniqueEc) { - $found=2; - last; - } - } - last if ($found == 2); - } - return 1 if ($found == 1); - } - return 0; -} - - -sub getTestIdPositions { - my ($numProteinsTest,$numProteinsWithEc) = @_; - my %testPositions; - while (keys %testPositions < $numProteinsTest ) { - my $randomNumber = int(rand($numProteinsWithEc)) + 1; - next if (exists $testPositions{$randomNumber}); - $testPositions{$randomNumber} = 1; - } - return \%testPositions; -} - -sub makeGroupDirAndFiles { - my ($outputDirectory,$group,$createStatsFile,$createProteinFile) = @_; - &makeDir("$outputDirectory/$group") if ($createStatsFile || $createProteinFile); - my $statsFh; - if ($createStatsFile) { - my $statsFile = "$outputDirectory/$group/stats.txt"; - open($statsFh,">",$statsFile) || die "Cannot open $statsFile for writing\n"; - } - my $proteinFile; - if ($createProteinFile) { - $proteinFile = "$outputDirectory/$group/proteins.txt"; - } - return ($statsFh,$proteinFile); -} - -sub abs_log10 { - my ($num) = @_; - - # Handle zero or negative input - return undef if $num <= 0; - - # Take the log base 10 and return absolute value, rounded - return int(abs(log($num) / log(10)) + 0.5); -} - -sub getAbbrev { - my ($dbh,$Id) = @_; - my $sql = "SELECT taxon_abbreviation FROM webready.ProteinSequenceGroup WHERE full_id = '$Id'"; - my $sh = $dbh->prepare($sql); - $sh->execute(); - - my $abbrev; - while(my $taxon_abbrev = $sh->fetchrow_array()) { - $abbrev = $taxon_abbrev - } - $sh->finish(); - - return $abbrev; -} - -sub getProteinToEValue { - my ($dbh,$groupId) = @_; - my $sql = "SELECT qseq, evalue FROM apidb.orthologgroupblastvalue WHERE group_id = '$groupId'"; - my $sh = $dbh->prepare($sql); - $sh->execute(); - - my %proteinToEValue; - while(my ($protein,$evalue) = $sh->fetchrow_array()) { - if ($evalue eq '0.0') { - $evalue = 1.0e-300; - } - $proteinToEValue{$protein} = abs_log10($evalue); - } - $sh->finish(); - - return \%proteinToEValue; -} - -sub getProteinToEC { - my ($dbh,$groupId) = @_; - my $sql = "SELECT pda.full_id,ec.ec_number - FROM SRes.EnzymeClass ec, DoTS.AASequenceEnzymeClass aaec, webready.ProteinDomainAssignment pda - WHERE ec.enzyme_class_id = aaec.enzyme_class_id - AND aaec.aa_sequence_id = pda.aa_sequence_id - AND pda.group_name = '$groupId'"; - my $sh = $dbh->prepare($sql); - $sh->execute(); - - my %proteinToEC; - while(my ($protein,$ec) = $sh->fetchrow_array()) { - $proteinToEC{$protein} = $ec; - } - $sh->finish(); - - return \%proteinToEC; -} - - -sub ecBlastStats { - my ($dbh,$group) = @_; - - # Get all proteins from that group with an EC assignment - my %proteinToEc = %{ &getProteinToEC($dbh,$group) }; - - # Get all proteins that have a significant hit to the group centroid. Note, this is stored as the absolute log of the evalue. - my %proteinToEValue = %{ &getProteinToEValue($dbh,$group) }; - - my %ecBestEvalue; - - # For every EC assignment in this group, record the best blast evalue to the centroid - # For each protein - foreach my $protein (keys %proteinToEc) { - # If we already have an protein with this EC, and it has a significant blast to the centroid - if ($ecBestEvalue{$proteinToEc{$protein}} && $proteinToEValue{$protein}) { - if ($ecBestEvalue{$proteinToEc{$protein}} < $proteinToEValue{$protein}) { - $ecBestEvalue{$proteinToEc{$protein}} = $proteinToEValue{$protein}; - } - } - # Another protein assigned this EC, but doesn't have a significant blast hit with the centroid - elsif ($ecBestEvalue{$proteinToEc{$protein}} && !$proteinToEValue{$protein}) { - next; - } - # First protein with this EC, and it has a significant blast value to the centroid - elsif (!$ecBestEvalue{$proteinToEc{$protein}} && $proteinToEValue{$protein}) { - $ecBestEvalue{$proteinToEc{$protein}} = $proteinToEValue{$protein}; - } - # First protein with this EC, and it does not have a significant blast value to the centroid - else { - $ecBestEvalue{$proteinToEc{$protein}} = 5; - } - } - - return (\%ecBestEvalue,\%proteinToEValue,\%proteinToEc); -} - -sub proteinDoesNotExist { - my ($proteinName,$proteinIds,$statsFh,$missingHashRef) = @_; - - if (! exists $proteinIds->{$proteinName}) { - if ($statsFh && ! exists $missingHashRef->{$proteinName}) { - print $statsFh "ERROR: The protein '$proteinName' does not exist in the original protein ids\n"; - $missingHashRef->{$proteinName}=1; - } - return 1; - } - return 0; -} - -sub numProteinsWithDomainAndEc { - my ($proteinIds) = @_; - my $numProteins = 0; - foreach my $id (keys %{$proteinIds}) { - my $numDomains = scalar @{$proteinIds->{$id}->{domain}}; - my $numEcs = scalar @{$proteinIds->{$id}->{ec}}; - if ($numDomains > 0 && $numEcs > 0) { - $numProteins++; - } - } - return $numProteins; -} - -sub numProteinsWithEc { - my ($proteinIds) = @_; - my $numProteins = 0; - foreach my $id (keys %{$proteinIds}) { - my $numEcs = scalar @{$proteinIds->{$id}->{ec}}; - $numProteins++ if ($numEcs > 0); - } - return $numProteins; -} - -sub getViableEcNumbers { - my ($proteinIds,$test,$trainingIds,$minNumGenera,$minNumProteins,$statsFh) = @_; - - my $actualEcNumbers = &getUniqueEcNumbersFromProteins($proteinIds,$test,$trainingIds); - #my $allEcNumbers = &addPartialEcNumbers($actualEcNumbers); - my $ecNumbersWithCounts = &getNumProteinsGeneraForEachEc($proteinIds,$test,$trainingIds,$actualEcNumbers); - &deletePartialEcNumbers($ecNumbersWithCounts); - &deleteEcNumbersBelowMin($ecNumbersWithCounts,$minNumProteins,$minNumGenera); - &printViableEcNumbers($ecNumbersWithCounts,$statsFh) if ($statsFh); - - return $ecNumbersWithCounts; -} - -sub printViableEcNumbers { - my ($ecNumbersWithCounts,$statsFh) = @_; - print $statsFh "EC_NUMBER\tNUM_PROTEINS\tNUM_GENERA\n"; - foreach my $ec (sort keys %{$ecNumbersWithCounts}) { - print $statsFh "$ec\t$ecNumbersWithCounts->{$ec}->{numProteins}\t$ecNumbersWithCounts->{$ec}->{numGenera}\n"; - } - print $statsFh "\n"; -} - -sub getUniqueEcNumbersFromProteins { - my ($proteinIds,$test,$trainingIds) = @_; - my $ecs; - foreach my $id (keys %{$proteinIds}) { - next if ($test && ! exists $trainingIds->{$id}); - foreach my $ec ( @{$proteinIds->{$id}->{ec}} ) { - $ecs->{$ec}++; - } - } - return $ecs; -} - -sub validEcNumber { - my ($ec) = @_; - if ( $ec =~ /^[0-9]+\.[0-9]+\.[0-9\-]+\.[0-9\-]+$/ ) { - return 1; - } else { - return 0; - } -} - -sub addPartialEcNumbers { - my ($ecs) = @_; - my $allEcs; - foreach my $ec (keys %{$ecs}) { - $allEcs->{$ec} = 1; - my ($a,$b,$c,$d) = split(/\./,$ec); - $allEcs->{"$a.$b.$c.-"} = 1; - $allEcs->{"$a.$b.-.-"} = 1; - } - return $allEcs; -} - -sub getNumProteinsGeneraForEachEc { - my ($proteinIds,$test,$trainingIds,$allEcNumbers) = @_; - my $ecNumbersWithCounts; - my $ecGenera; - foreach my $id (keys %{$proteinIds}) { - next if ($test && ! exists $trainingIds->{$id}); - my $genus = &getGenusFromProtein($proteinIds->{$id}); - foreach my $ec ( keys %{$allEcNumbers} ) { - if (&proteinHasThisEcNumber($proteinIds->{$id},$ec)) { - $ecNumbersWithCounts->{$ec}->{numProteins}++; - $ecGenera->{$ec} = &addArrayElement($ecGenera->{$ec},$genus); - } - } - } - - &addNumberGenera($ecNumbersWithCounts,$ecGenera); - return $ecNumbersWithCounts; -} - -sub getGenusFromProtein { - my ($protein) = @_; - my @taxonArray = split(" ",$protein->{taxon}); - return $taxonArray[0]; -} - -sub addArrayElement { - my ($arrayRef,$element) = @_; - if (defined $arrayRef) { - push @{$arrayRef}, $element; - } else { - $arrayRef = [$element]; - } - return $arrayRef; -} - -sub addNumberGenera { - my ($ecNumbersWithCounts,$ecGenera) = @_; - foreach my $ec (keys %{$ecNumbersWithCounts}) { - my %allGenera = map { $_ => 1 } @{$ecGenera->{$ec}}; - my $numGenera = scalar keys %allGenera; - $ecNumbersWithCounts->{$ec}->{numGenera} = $numGenera - } -} - -sub deleteEcNumbersBelowMin { - my ($ecs,$minNumProteins,$minNumGenera) = @_; - my %toDelete; - foreach my $ec (keys %{$ecs}) { - if ($ecs->{$ec}->{numProteins} < $minNumProteins || $ecs->{$ec}->{numGenera} < $minNumGenera) { - $toDelete{$ec} = 1; - } - } - foreach my $ec (keys %toDelete) { - delete $ecs->{$ec}; - } -} - -sub deletePartialEcNumbers { - my ($ecs) = @_; - my %toDelete; - foreach my $ec (keys %{$ecs}) { - my $parentEc = &getParent($ec); - if ($parentEc && exists $ecs->{$parentEc}) { - if ($ecs->{$ec}->{numProteins} == $ecs->{$parentEc}->{numProteins}) { - $toDelete{$parentEc} = 1; # if parent has same number proteins then delete parent - } - } - } - foreach my $ec (keys %toDelete) { - delete $ecs->{$ec}; - } -} - -sub getParent { - my ($ec) = @_; - return "" if ($ec eq ""); - my ($a,$b,$c,$d) = split(/\./,$ec); - return "" if ($c eq "-"); - return "$a.$b.-.-" if ($d eq "-"); - return "$a.$b.$c.-"; -} - -sub getAllAncestors { - my ($ec,$ancestorEcs) = @_; - my $parentEc = &getParent($ec); - if ($parentEc ne "") { - $ancestorEcs->{$parentEc} = 1; - &getAllAncestors($parentEc,$ancestorEcs); - } -} - -sub readDomainCountFile { - my ($domainCountFile) = @_; - my $domainCount; - - open(IN,$domainCountFile) || die "Cannot open $domainCountFile\n"; - my $header = ; - my $totalLine = ; - chomp($totalLine); - my ($text,$numProteins) = split("\t",$totalLine); - $domainCount->{numProteins} = $numProteins; - while (my $line=) { - chomp($line); - my ($domain,$num) = split("\t",$line); - $domainCount->{domain}->{$domain} = $num; - } - close IN; - return $domainCount; -} - -sub ecLengthStats { - my ($proteinIds,$test,$trainingIds,$viableEcNumbers,$statsFh) = @_; - - # for each viable EC number, obtain array of protein lengths - my $ecNumbers; - foreach my $id (keys %{$proteinIds}) { - next if ( scalar @{$proteinIds->{$id}->{ec}} == 0 ); - next if ($test && ! exists $trainingIds->{$id}); - foreach my $viableEc ( keys %{$viableEcNumbers} ) { - if (&proteinHasThisEcNumber($proteinIds->{$id},$viableEc)) { - $ecNumbers->{$viableEc} = &addArrayElement($ecNumbers->{$viableEc},$proteinIds->{$id}->{length}); - } - } - } - - # for each EC number, calculate min, max, avg, median - my $ecStats; - foreach my $ec (keys %{$ecNumbers}) { - my $noValues = scalar @{$ecNumbers->{$ec}} == 0 ? 1 : 0; - $ecStats->{$ec}->{numProteins} = scalar @{$ecNumbers->{$ec}}; - $ecStats->{$ec}->{min} = $noValues ? -1 : min($ecNumbers->{$ec}); - $ecStats->{$ec}->{max} = $noValues ? -1 : max($ecNumbers->{$ec}); - $ecStats->{$ec}->{median} = $noValues ? -1 : median($ecNumbers->{$ec}); - my ($mean,$sd) = meanSd($ecNumbers->{$ec}); - $ecStats->{$ec}->{mean} = $noValues ? -1 : $mean; - $ecStats->{$ec}->{sd} = $noValues ? -1 : $sd; - } - - &printEcLengthStats($ecStats,$statsFh) if ($statsFh); - - return $ecStats; -} - -sub printEcLengthStats { - my ($ecStats,$statsFh) = @_; - print $statsFh "EC_NUMBER\tNUM_PROTEINS\tMIN_LENGTH\tMAX_LENGTH\t_MEDIAN_LENGTH\tMEAN_LENGTH\tSTD_DEV_LENGTH\n"; - foreach my $ec (sort keys %{$ecStats}) { - print $statsFh "$ec\t$ecStats->{$ec}->{numProteins}"; - print $statsFh "\t$ecStats->{$ec}->{min}\t$ecStats->{$ec}->{max}"; - print $statsFh "\t$ecStats->{$ec}->{median}\t$ecStats->{$ec}->{mean}"; - print $statsFh "\t$ecStats->{$ec}->{sd}\n"; - } - print $statsFh "\n"; -} - -sub min { - my ($arrayRef) = @_; - my $min = $arrayRef->[0]; - foreach my $number (@{$arrayRef}) { - $min = $number if ($number < $min); - } - return $min; -} - -sub max { - my ($arrayRef) = @_; - my $max = $arrayRef->[0]; - foreach my $number (@{$arrayRef}) { - $max = $number if ($number > $max); - } - return $max; -} - -sub meanSd { - my ($arrayRef) = @_; - my $numSamples = scalar @{$arrayRef}; - my $sum = &sum($arrayRef); - my $mean = $sum/$numSamples; - - $sum=0; - foreach my $number (@{$arrayRef}) { - $sum += ($number-$mean)**2; - } - my $sd = sqrt($sum/$numSamples); - - return ($mean,$sd); -} - -sub sum { - my ($arrayRef) = @_; - my $sum = 0; - foreach my $number (@{$arrayRef}) { - $sum += $number; - } - return $sum; -} - -sub median { - my ($arrayRef) = @_; - my @sorted = sort {$a <=> $b} @{$arrayRef}; - my $length = scalar @{$arrayRef}; - if ( $length%2 ) { - return $sorted[int($length/2)]; - } else { - return ($sorted[int($length/2)-1] + $sorted[int($length/2)])/2; - } -} - -sub proteinHasThisEcNumber { - my ($protein,$ecNumber) = @_; - my ($a1,$b1,$c1,$d1) = split(/\./,$ecNumber); - foreach my $currentEc ( @{$protein->{ec}} ) { - my ($a2,$b2,$c2,$d2) = split(/\./,$currentEc); - $c2 = "-" if ($c1 eq "-"); - $d2 = "-" if ($d1 eq "-"); - if (($a1 eq $a2) && ($b1 eq $b2) && ($c1 eq $c2) && ($d1 eq $d2)) { - return 1; - } - } - return 0; -} - -sub ecDomainStats { - my ($proteinIds,$test,$trainingIds,$viableEcNumbers,$statsFh) = @_; - - my $domainToLetter = &getDomainKey($proteinIds,$statsFh); - - my ($ecNumbers,$domainPerProtein) = &getAllDomainStringsPerEc($proteinIds,$test,$trainingIds,$viableEcNumbers,$domainToLetter); - &calculateDomainNumAndScore($ecNumbers,$viableEcNumbers); - &calculateDomainMaxScore($ecNumbers); - - &printEcDomainStats($ecNumbers,$statsFh) if ($statsFh); - &printProteinDomains($domainPerProtein,$statsFh) if ($statsFh); - - return ($ecNumbers,$domainPerProtein); -} - -sub printProteinDomains { - my ($domainPerProtein,$statsFh) = @_; - print $statsFh "GENE\tDOMAINS\n"; - foreach my $id (keys %{$domainPerProtein}) { - my $domainString = $domainPerProtein->{$id}; - print $statsFh "$id\t$domainString\n"; - } - print $statsFh "\n"; -} - -sub printEcDomainStats { - my ($ecNumbers,$statsFh) = @_; - print $statsFh "EC_NUMBER\tDOMAIN_STRING\tNUM_PROTEINS\tSCORE\n"; - foreach my $ec (sort keys %{$ecNumbers}) { - print $statsFh "$ec\t--NUM PROTEINS--\t$ecNumbers->{$ec}->{numProteins}\t$ecNumbers->{$ec}->{maxScore}\n"; - foreach my $string ( keys %{$ecNumbers->{$ec}->{domainString}} ) { - print $statsFh "$ec\t$string"; - print $statsFh "\t$ecNumbers->{$ec}->{domainString}->{$string}->{count}"; - print $statsFh "\t$ecNumbers->{$ec}->{domainString}->{$string}->{score}\n"; - } - } - print $statsFh "\n"; -} - -sub getAllDomainStringsPerEc { - my ($proteinIds,$test,$trainingIds,$viableEcNumbers,$domainToLetter) = @_; - my ($ecNumbers,$domainPerProtein); - - foreach my $id (keys %{$proteinIds}) { - my $domainString = &getDomain($proteinIds->{$id},$domainToLetter); - $domainPerProtein->{$id} = $domainString; - next if (scalar @{$proteinIds->{$id}->{ec}} == 0); - next if ($test && ! exists $trainingIds->{$id}); - my $domains = &getAllPossibleCombinations($domainString,""); - foreach my $viableEc ( keys %{$viableEcNumbers} ) { - if (&proteinHasThisEcNumber($proteinIds->{$id},$viableEc)) { - foreach my $domain (keys %{$domains}) { - $ecNumbers->{$viableEc}->{domainString}->{$domain}->{count}++; - } - } - } - } - return ($ecNumbers,$domainPerProtein); -} - -sub calculateDomainNumAndScore { - my ($ecNumbers,$viableEcNumbers) = @_; - foreach my $ec (keys %{$ecNumbers}) { - $ecNumbers->{$ec}->{numProteins} = $viableEcNumbers->{$ec}->{numProteins}; - foreach my $string ( keys %{$ecNumbers->{$ec}->{domainString}} ) { - $ecNumbers->{$ec}->{domainString}->{$string}->{score} = $ecNumbers->{$ec}->{domainString}->{$string}->{count} / $viableEcNumbers->{$ec}->{numProteins}; - } - } -} - -sub calculateDomainMaxScore { - my ($ecNumbers) = @_; - foreach my $ec (keys %{$ecNumbers}) { - $ecNumbers->{$ec}->{maxScore} = 0; - foreach my $string ( keys %{$ecNumbers->{$ec}->{domainString}} ) { - $ecNumbers->{$ec}->{maxScore} += $ecNumbers->{$ec}->{domainString}->{$string}->{score}; - } - } -} - -sub getDomain { - my ($proteinRef,$domainToLetter) = @_; - my $domain = "-"; - if (scalar @{$proteinRef->{domain}} > 0) { - my @domainLetters = map { $domainToLetter->{$_} } @{$proteinRef->{domain}}; - $domain = join("",@domainLetters); - } - return $domain; -} - -sub numProteinScore { - my ($numProteinsWithEc) = @_; - if ($numProteinsWithEc > 1) { - return 1; - } else { - return 0; - } -} - -sub numGeneraScore { - my ($numGeneraWithEc) = @_; - if ($numGeneraWithEc > 1) { - return 1; - } else { - return 0; - } -} - -sub domainScore { - my ($id,$ec,$domainStatsPerEc,$domainPerProtein) = @_; - my $score=0; - my $idDomain = $domainPerProtein->{$id}; - die "There is no domain for $id. Testing EC $ec.\n" if (! defined $idDomain); - die "There is no domainStatsPerEc for $ec. Testing $id.\n" if (! exists $domainStatsPerEc->{$ec}); - - foreach my $domainString ( keys %{$domainStatsPerEc->{$ec}->{domainString}} ) { - if ($idDomain =~ /$domainString/) { - $score += $domainStatsPerEc->{$ec}->{domainString}->{$domainString}->{score}; - } - } - my $normalizedScore = $score / $domainStatsPerEc->{$ec}->{maxScore}; - if ($normalizedScore > 0.8 ) { - return 4; - } elsif ($normalizedScore > 0.6) { - return 3; - } elsif ($normalizedScore > 0.4) { - return 2; - } elsif ($normalizedScore > 0.2) { - return 1; - } else { - return 0; - } -} - -sub lengthScore { - my ($id,$ec,$proteinIds,$lengthStatsPerEc) = @_; - - my $idLength = $proteinIds->{$id}->{length}; - my $idDistanceFromMedian = abs($idLength - $lengthStatsPerEc->{$ec}->{median}); - my $tenPercentOfMedian = 0.1 * $lengthStatsPerEc->{$ec}->{median}; - - if ($idDistanceFromMedian <= $tenPercentOfMedian ) { - return 4; - } elsif ($idDistanceFromMedian <= 2*$tenPercentOfMedian) { - return 3; - } elsif ($idDistanceFromMedian <= 3*$tenPercentOfMedian) { - return 2; - } elsif ($idLength >= $lengthStatsPerEc->{$ec}->{min} && $idLength <= $lengthStatsPerEc->{$ec}->{max}) { - return 1; - } else { - return 0; - } -} - -sub blastScore { - my ($id,$ec,$blastStatsPerEc,$blastStatsPerProtein,$proteinToEc) = @_; - #print "EC is $ec ID is $id\n"; - # This protein was already assigned this EC - if ($proteinToEc->{$id} && $proteinToEc->{$id} eq $ec) { - return 4; - } - # This protein has no significant hit to the best representative - if (!$blastStatsPerProtein->{$id}) { - return 0; - } - # This protein did not have a good hit to the best rep - elsif ($blastStatsPerProtein->{$id} < 10) { - return 0; - } - # If the protein to centroid blast was > 100 - elsif ($blastStatsPerProtein->{$id} > 100) { - if (!$blastStatsPerEc->{$ec}) { - return 0; - } - # And the best blast to the centroid for all proteins with this EC was > 100 - elsif ($blastStatsPerEc->{$ec} > 100) { - # This protein EC combo scores a 3 - return 3; - } - # If the best blast to the centroid for all proteins with this EC was > 50 - elsif ($blastStatsPerEc->{$ec} > 50) { - # This protein EC combo scores a 2 - return 2; - } - # If the best blast to the centroid for all proteins with this EC was >= 10 - elsif ($blastStatsPerEc->{$ec} >= 10) { - # This protein EC combo scores a 1 - return 1; - } - } - # Same process for this but this protein EC combo can only score a max of a 2 - elsif ($blastStatsPerProtein->{$id} > 50) { - if (!$blastStatsPerEc->{$ec}) { - return 0; - } - elsif ($blastStatsPerEc->{$ec} > 50) { - return 2; - } - elsif ($blastStatsPerEc->{$ec} >= 10) { - return 1; - } - } - # Same process for this but this protein EC combo can only score a max of a 1 - elsif ($blastStatsPerProtein->{$id} >= 10) { - if (!$blastStatsPerEc->{$ec}) { - return 0; - } - if ($blastStatsPerEc->{$ec} >= 10) { - return 1; - } - } -} - -sub getProteinScores { - my ($proteinIds,$numTotalProteins,$numTotalGenera,$test,$testIds,$viableEcNumbers,$domainStatsPerEc,$domainPerProtein,$lengthStatsPerEc,$blastStatsPerEc,$blastStatsPerProtein,$proteinToEc,$outputFile,$dbh) = @_; - - my $scores; - foreach my $id (keys %{$proteinIds}) { - next if ($test && ! exists $testIds->{$id}); - foreach my $ec (keys %{$viableEcNumbers}) { - $scores->{$id}->{$ec}->{numProteinsWithEc} = $viableEcNumbers->{$ec}->{numProteins}; - $scores->{$id}->{$ec}->{numGeneraWithEc} = $viableEcNumbers->{$ec}->{numGenera}; - $scores->{$id}->{$ec}->{lengthScore} = &lengthScore($id,$ec,$proteinIds,$lengthStatsPerEc); - $scores->{$id}->{$ec}->{blastScore} = &blastScore($id,$ec,$blastStatsPerEc,$blastStatsPerProtein,$proteinToEc); - $scores->{$id}->{$ec}->{domainScore} = &domainScore($id,$ec,$domainStatsPerEc,$domainPerProtein); - $scores->{$id}->{$ec}->{numTotalProteins} = $numTotalProteins; - $scores->{$id}->{$ec}->{numTotalGenera} = $numTotalGenera; - delete $scores->{$id}->{$ec} unless (&meetsScoreThreshold($scores->{$id}->{$ec})); - } - } - &printEcScores($scores,$outputFile,$dbh) if (!$test); - return $scores; -} - -sub meetsScoreThreshold { - my ($scoreRef) = @_; - if ($scoreRef->{lengthScore} >= 1 && $scoreRef->{blastScore} >= 1 && $scoreRef->{domainScore} >= 1) { - return 1; - } - return 0; -} - - -sub deletePartialEcWithWorseScore { - my ($scores) = @_; - foreach my $id (keys %{$scores}) { - my %toDelete; - foreach my $ec (keys %{$scores->{$id}}) { - my %ancestorEcs; - &getAllAncestors($ec,\%ancestorEcs); - next if (keys %ancestorEcs == 0); - - foreach my $ancestorEc (keys %ancestorEcs) { - if (exists $scores->{$id}->{$ancestorEc} && &sameOrWorseScore($scores->{$id},$ec,$ancestorEc)) { - $toDelete{$ancestorEc} = 1; - } - } - } - - foreach my $ec (keys %toDelete) { - delete $scores->{$id}->{$ec}; - } - } -} - -sub sameOrWorseScore { - my ($scoresRef,$ec1,$ec2) = @_; - return 1 if ($scoresRef->{$ec1}->{numProteinsWithEc} >= 2 && - $scoresRef->{$ec1}->{numGeneraWithEc} >= 2 && - $scoresRef->{$ec2}->{lengthScore} <= - $scoresRef->{$ec1}->{lengthScore} && - $scoresRef->{$ec2}->{blastScore} <= - $scoresRef->{$ec1}->{blastScore} && - $scoresRef->{$ec2}->{domainScore} <= - $scoresRef->{$ec1}->{domainScore}); - return 0; -} - -sub scoreToNumber { - my ($scoreRef) = @_; - my $score = $scoreRef->{domainScore} == -1 ? 0 : $scoreRef->{domainScore}/10; - $score += $scoreRef->{lengthScore} + $scoreRef->{blastScore}; - return $score; -} - -sub printEcScores { - my ($scores,$outputFile,$dbh) = @_; - foreach my $id (keys %{$scores}) { - my $editedId = $id; - my $abbrev; - if ($id =~ /^([a-z]{4})\|/) { - $abbrev = $1; - } else { - $abbrev = &getAbbrev($dbh,$id); - } - if ($abbrev eq "rhiz") { # this is temporary, because rhiz on orthomcl equals rirr on fungidb - $abbrev = "rirr"; - $editedId =~ s/^rhiz/rirr/; - } - open(OUT,">>",$outputFile) || die "Cannot open file '$outputFile' for appending"; - foreach my $ec (keys %{$scores->{$id}}) { - my $string = &scoresToString($scores->{$id}->{$ec}); - my $text = "$editedId\t$ec\t$string\n"; - print OUT $text; - } - close OUT; - } -} - -sub scoresToString { - my ($scoresRef) = @_; - my $a = $scoresRef->{numProteinsWithEc}; - my $b = $scoresRef->{numTotalProteins}; - my $c = $scoresRef->{numGeneraWithEc}; - my $d = $scoresRef->{numTotalGenera}; - my $e = $scoresRef->{lengthScore}; - my $f = $scoresRef->{blastScore}; - my $g = $scoresRef->{domainScore}; - return "$a\t$b\t$c\t$d\t$e\t$f\t$g"; -} - -sub testScores { - my ($scores,$proteinIds,$testIds,$totalEcsRef,$noEcMatchRef,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh,$logFh) = @_; - my $group = &getGroupFromProteins($proteinIds,$logFh); - print $testFh "Group $group\n"; - foreach my $id (keys %{$testIds}) { - foreach my $thisIdEc (@{$proteinIds->{$id}->{ec}}) { - ${$totalEcsRef}++; - print $testFh "$id\t$thisIdEc\t"; - &testEcNumberMatch($thisIdEc,$scores->{$id},$noEcMatchRef,$testExact,$testLessPrecise,$testMorePrecise,$testFh); - } - &testExtraEc($id,$proteinIds->{$id}->{ec},$scores->{$id},$numProteinsExtraEcRef,$numExtraEcRef,$testFh); - ${$numTotalProteinsRef}++; - } -} - -sub testExtraEc { - my ($id,$idEcs,$scoresForThisId,$numProteinsExtraEcRef,$numExtraEcRef,$testFh) = @_; - my $proteinNoMatch = 0; - foreach my $predictedEc (keys %{$scoresForThisId}) { - my $ecMatch = 0; - foreach my $idEc (@{$idEcs}) { - if ($idEc eq $predictedEc) { - $ecMatch = 1; - last; - } - } - if ($ecMatch == 0) { - $proteinNoMatch = 1; - my $string = &scoresToString($scoresForThisId->{$predictedEc}); - print $testFh "$id\t$predictedEc\t$string\tnew_assignment\n"; - ${$numExtraEcRef}++; - } - } - ${$numProteinsExtraEcRef}++ if ($proteinNoMatch == 1); -} - -sub testEcNumberMatch { - my ($thisIdEc,$scoresForThisId,$noEcMatchRef,$testExact,$testLessPrecise,$testMorePrecise,$testFh) = @_; - if (exists $scoresForThisId->{$thisIdEc}) { #exact match - my $string = &scoresToString($scoresForThisId->{$thisIdEc}); - my $score = &scoreToNumber($scoresForThisId->{$thisIdEc}); - $testExact->{$score}++; - print $testFh "$string\texact\n"; - return; - } - if (&testMorePrecise($thisIdEc,$scoresForThisId,$testMorePrecise,$testFh)) { - return; - } - if (&testLessPrecise($thisIdEc,$scoresForThisId,$testLessPrecise,$testFh)) { - return; - } - &testNoMatch($noEcMatchRef,$scoresForThisId,$testFh); -} - -sub testNoMatch { - my ($noEcMatchRef,$scoresForThisId,$testFh) = @_; - ${$noEcMatchRef}++; - print $testFh "\t\t\t\t\t\t\tno_match,ECs_predicted: "; - foreach my $thisIdEc (sort keys %{$scoresForThisId}) { - my $score = &scoreToNumber($scoresForThisId->{$thisIdEc}); - print $testFh "$thisIdEc($score)"; - } - print $testFh "\n"; -} - -sub testLessPrecise { - my ($thisIdEc,$scoresForThisId,$testLessPrecise,$testFh) = @_; - my $thisIdParentEc = &getParent($thisIdEc); - if (! $thisIdParentEc) { - return 0; - } elsif (exists $scoresForThisId->{$thisIdParentEc}) { - my $string = &scoresToString($scoresForThisId->{$thisIdParentEc}); - my $score = &scoreToNumber($scoresForThisId->{$thisIdParentEc}); - print $testFh "$string\tprediction_less_precise: $thisIdParentEc\n"; - $testLessPrecise->{$score}++; - return 1; - } else { - return &testLessPrecise($thisIdParentEc,$scoresForThisId,$testLessPrecise,$testFh); - } -} - -sub testMorePrecise { - my ($thisIdEc,$scoresForThisId,$testMorePrecise,$testFh) = @_; - foreach my $predictedEc (keys %{$scoresForThisId}) { - if (&partialMatchEc($thisIdEc,$predictedEc)) { - my $string = &scoresToString($scoresForThisId->{$predictedEc}); - my $score = &scoreToNumber($scoresForThisId->{$predictedEc}); - print $testFh "$string\tprediction_more_precise: $predictedEc\n"; - $testMorePrecise->{$score}++; - return 1; - } - } - return 0; -} - -sub partialMatchEc { - my ($thisIdEc,$predictedEc) = @_; - my $predictedParentEc = &getParent($predictedEc); - if (! $predictedParentEc) { - return 0; - } elsif ($thisIdEc eq $predictedParentEc) { - return 1; - } else { - return &partialMatchEc($thisIdEc,$predictedParentEc); - } -} - -sub printTest { - my ($total,$noMatch,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh) = @_; - my $percentNoMatch = sprintf("%.1f",100 * ${$noMatch} / ${$total}); - my $numExactMatch = &sumHashValues($testExact); - my $percentExactMatch = sprintf("%.1f",100 * $numExactMatch / ${$total}); - my $numLessPreciseMatch = &sumHashValues($testLessPrecise); - my $percentLessPreciseMatch = sprintf("%.1f",100 * $numLessPreciseMatch / ${$total}); - my $numMorePreciseMatch = &sumHashValues($testMorePrecise); - my $percentMorePreciseMatch = sprintf("%.1f",100 * $numMorePreciseMatch / ${$total}); - my $percentProteinsExtraEc = sprintf("%.1f",100 * ${$numProteinsExtraEcRef} / ${$numTotalProteinsRef}); - print $testFh "\nSUMMARY\n"; - print $testFh "Total tested ECs\t${$total}\n"; - print $testFh "No match\t${$noMatch} ($percentNoMatch %)\n"; - print $testFh "Exact match\t$numExactMatch ($percentExactMatch %)\n"; - print $testFh "Less precise match\t$numLessPreciseMatch ($percentLessPreciseMatch %)\n"; - print $testFh "More precise match\t$numMorePreciseMatch ($percentMorePreciseMatch %)\n"; - print $testFh "Extra ECs\t${$numExtraEcRef}\n"; - print $testFh "\nTotal tested proteins\t${$numTotalProteinsRef}\n"; - print $testFh "Num proteins w/ extra predicted EC\t${$numProteinsExtraEcRef} ($percentProteinsExtraEc %)\n"; - print $testFh "\nExact predictions:\n"; - foreach my $score (sort { $b cmp $a } keys %{$testExact}) { - print $testFh "$score\t$testExact->{$score}\n"; - } - print $testFh "Less precise predictions:\n"; - foreach my $score (sort { $b cmp $a } keys %{$testLessPrecise}) { - print $testFh "$score\t$testLessPrecise->{$score}\n"; - } - print $testFh "More precise predictions:\n"; - foreach my $score (sort { $b cmp $a } keys %{$testMorePrecise}) { - print $testFh "$score\t$testMorePrecise->{$score}\n"; - } - - close $testFh; -} - -sub sumHashValues { - my ($hashRef) = @_; - my $sum = 0; - foreach my $key (keys %{$hashRef}) { - $sum += $hashRef->{$key}; - } - return $sum; -} - -sub getAllPossibleCombinations { - my ($domainString,$delimiter) = @_; - - my $domains; - my @domainArray = split(/$delimiter/,$domainString); - my $length = scalar @domainArray; - for (my $a=0; $a<$length; $a++) { - for (my $b=$a; $b<$length; $b++) { - my $string = join("$delimiter",@domainArray[$a..$b]); - $domains->{$string} = 1; - } - } - return $domains; -} - - -sub getDomainKey { - my ($proteinIds,$statsFh) = @_; - - my $domains; - - foreach my $protein (keys %{$proteinIds}) { - foreach my $domain (@{$proteinIds->{$protein}->{domain}}) { - $domains->{$domain}++; - } - } - - my @alphabet = ("A".."Z","a".."z",0..9); - my $numCharacters = scalar @alphabet; - my $numDomains = scalar keys %{$domains}; - # stringLength is how long string needs to be in order to capture all domains - # 1 if <= 62 domains, 2 if <= 3844, 3 if <= 238328, 4 if <= 14776336 (max length is set to 4) - my $stringLength = 0; - for (my $a=1; $a<=4; $a++) { - if ($numDomains <= $numCharacters**$a) { - $stringLength = $a; - last; - } - } - die "There are more than 14,776,336 domains in this group so cannot perform A-Z a-z 0-9 mapping" if ($stringLength == 0); - - my $stringCounter = &initializeStringCounter($stringLength); - my @domainArray = sort { $domains->{$b} <=> $domains->{$a} } keys %{$domains}; - - print $statsFh "DOMAIN\tSTRING\n" if ($statsFh); - for (my $currentDomain=0; $currentDomain<$numDomains; $currentDomain++) { - my $currentString = &makeStringFromCounter($stringCounter,\@alphabet); - $domains->{$domainArray[$currentDomain]} = $currentString; - print $statsFh "$domainArray[$currentDomain]\t$currentString\n" if ($statsFh); - $stringCounter = &increaseStringCounter($stringCounter,$numCharacters); - } - print $statsFh "\n" if ($statsFh); - - return $domains; -} - -sub initializeStringCounter { - my ($stringLength) = @_; - my $stringCounter; - for (my $a=0; $a<$stringLength; $a++) { - push @{$stringCounter}, 0; - } - return $stringCounter; -} - -sub makeStringFromCounter { - my ($stringCounter,$alphabet) = @_; - my @string = map { $alphabet->[$_] } @{$stringCounter}; - return join("",@string); -} - -sub increaseStringCounter { - my ($stringCounter,$numCharacters) = @_; - - my $numDigits = scalar @{$stringCounter}; - - for (my $i=($numDigits-1); $i>=0; $i--) { - if ( $stringCounter->[$i] == ($numCharacters-1) ) { - $stringCounter->[$i] = 0; - } else { - $stringCounter->[$i]++; - last; - } - } - return $stringCounter; - -} - -sub writeProteinInfoFile { - my ($proteinIds,$numTotalProteins,$numTotalGenera,$proteinInfoFile) =@_; - - open(OUT,">",$proteinInfoFile) || die "Cannot open '$proteinInfoFile' for writing"; - print OUT "Total num proteins\t$numTotalProteins\n"; - print OUT "Total num genera\t$numTotalGenera\n"; - print OUT "GROUP\tPROTEIN_ID\tTAXON\tLENGTH\tDOMAIN\tPRODUCT\tEC_NUMBER\n"; - foreach my $id (keys %{$proteinIds}) { - my $group = $proteinIds->{$id}->{group}; - my $length = $proteinIds->{$id}->{length}; - my $product = $proteinIds->{$id}->{product}; - my $taxon = $proteinIds->{$id}->{taxon}; - my @ecs = @{$proteinIds->{$id}->{ec}}; - $ecs[0] = "-" if (scalar @ecs == 0); - my $ec = join(",",@ecs); - my @domains = @{$proteinIds->{$id}->{domain}}; - $domains[0] = "-" if (scalar @domains == 0); - my $domain = join(",",@domains); - print OUT "$group\t$id\t$taxon\t$length\t$domain\t$product\t$ec\n"; - } - close OUT; -} - -sub readProteinInfoFile { - my ($proteinInfoFile,$excludeOld) = @_; - my $proteinIds; - open(IN,$proteinInfoFile) || die "Cannot open '$proteinInfoFile' for reading"; - my $numTotalProteins = &getSecondColumn(); - my $numTotalGenera = &getSecondColumn(); - while (my $line = ) { - chomp($line); - next if ($line !~ /^OG/); - my ($group,$id,$taxon,$length,$domains,$product,$ecs) = split("\t",$line); - next if ($excludeOld && $id =~ /-old\|/); - $proteinIds->{$id}->{group} = $group; - $proteinIds->{$id}->{taxon} = $taxon; - $proteinIds->{$id}->{length} = $length; - $proteinIds->{$id}->{product} = $product; - $ecs = $ecs eq "-" ? "" : $ecs; - $proteinIds->{$id}->{ec} = [split(",",$ecs)]; - $domains = $domains eq "-" ? "" : $domains; - $proteinIds->{$id}->{domain} = [split(",",$domains)]; - } - close IN; - return ($proteinIds,$numTotalProteins,$numTotalGenera); -} - -sub getSecondColumn { - my ($line) = @_; - chomp $line; - my @lineArray = split("\t",$line); - return $lineArray[1]; -} - - -sub getProteinInfo { - my ($group,$excludeOld,$dbh,$proteinInfoFile) = @_; - - my $proteinIds; - my $numTotalProteins; - my $numTotalGenera; - - if ($proteinInfoFile && -e $proteinInfoFile) { - ($proteinIds,$numTotalProteins,$numTotalGenera) = &readProteinInfoFile($proteinInfoFile,$excludeOld); - } else { - ($proteinIds,$numTotalProteins,$numTotalGenera) = &getProteinsFromDatabase($proteinIds,$group,$excludeOld,$dbh); - $proteinIds = &getEcsFromDatabase($proteinIds,$group,$excludeOld,$dbh); - $proteinIds = &getDomainsFromDatabase($proteinIds,$group,$excludeOld,$dbh); - &writeProteinInfoFile($proteinIds,$numTotalProteins,$numTotalGenera,$proteinInfoFile) if ($proteinInfoFile); - } - return ($proteinIds,$numTotalProteins,$numTotalGenera); -} - -sub getProteinsFromDatabase { - my ($proteinIds,$group,$excludeOld,$dbh) = @_; - my $numTotalProteins = 0; - my %genera; - my $query = $dbh->prepare(&proteinsSql($group)); - $query->execute(); - while (my($id,$product,$length,$corePeripheral,$group,$taxon) = $query->fetchrow_array()) { - next if ( ($excludeOld && $id =~ /-old\|/) || $id eq ""); - $numTotalProteins++; - $proteinIds->{$id}->{product} = $product; - $proteinIds->{$id}->{length} = $length; - $proteinIds->{$id}->{corePeripheral} = $corePeripheral; - $proteinIds->{$id}->{group} = $group; - $proteinIds->{$id}->{taxon} = $taxon; - $proteinIds->{$id}->{ec} = []; - $proteinIds->{$id}->{domain} = []; - $genera{&getGenusFromProtein($proteinIds->{$id})} = 1; - } - $query->finish(); - my $numTotalGenera = keys %genera; - return ($proteinIds,$numTotalProteins,$numTotalGenera); -} - -sub createIdString { - my ($proteinIds,$excludeOld) = @_; - my @ids; - foreach my $id (keys %{$proteinIds}) { - next if ($excludeOld && $id =~ /-old\|/); - push @ids, $id; - } - my $idString = join("','",@ids); - return "('".$idString."')"; -} - -sub getEcsFromDatabase { - my ($proteinIds,$group,$excludeOld,$dbh) = @_; - - my $query = $dbh->prepare(&ecsSql($group)); - - $query->execute(); - while (my($id,$ecString) = $query->fetchrow_array()) { - next if ($excludeOld && $id =~ /-old\|/); - die "The protein '$id' was not found in group" if (! exists $proteinIds->{$id}); - $ecString =~ s/ //g; - my @multipleEc = split(/[;,]/,$ecString); - foreach my $ec ( @multipleEc) { - next if (! &validEcNumber($ec)); - push @{$proteinIds->{$id}->{ec}}, $ec; - } - } - $query->finish(); - - # sort EC numbers for each protein - foreach my $id (keys %{$proteinIds}) { - @{$proteinIds->{$id}->{ec}} = sort @{$proteinIds->{$id}->{ec}}; - } - - return $proteinIds; -} - -sub getDomainsFromDatabase { - my ($proteinIds,$group,$excludeOld,$dbh) = @_; - - my $query = $dbh->prepare(&domainsSql($group)); - - $query->execute(); - while (my($id,$domain) = $query->fetchrow_array()) { - next if ($excludeOld && $id =~ /-old\|/); - die "The protein '$id' was not found in group" if (! exists $proteinIds->{$id}); - push @{$proteinIds->{$id}->{domain}}, $domain; - } - $query->finish(); - return $proteinIds; -} - -sub writeDomainCountFile { - my ($domainCount,$domainCountFile) = @_; - - open(OUT,">",$domainCountFile) || die "Cannot open $domainCountFile for writing\n"; - print OUT "DOMAIN\tNUM_PROTEINS\n"; - print OUT "all_proteins\t$domainCount->{numProteins}\n"; - foreach my $domain (keys %{$domainCount->{domain}}) { - print OUT "$domain\t$domainCount->{domain}->{$domain}\n"; - } - close OUT; -} - -sub getGroupFromProteins { - my ($proteinIds,$logFh) = @_; - my $group = ""; - foreach my $protein (keys %{$proteinIds}) { - if ($group ne "" && $group ne $proteinIds->{$protein}->{group}) { - die "expected only one ortholog group but obtained more than one: $group $proteinIds->{$protein}->{group}"; - } - $group = $proteinIds->{$protein}->{group}; - } - if ($group eq "") { - print {$logFh ? $logFh : *STDERR } "Did not obtain any ortholog groups from these proteins: "; - print {$logFh ? $logFh : *STDERR } "$_ " foreach (keys %{$proteinIds}); - die; - } - return $group; -} - -sub getGroupsFromDatabase { - my ($minNumProteinsWithEc,$maxNumProteinsWithEc,$dbh) = @_; - my %groups; - - my $query = $dbh->prepare(&groupsSql($minNumProteinsWithEc,$maxNumProteinsWithEc)); - - $query->execute(); - while (my($group) = $query->fetchrow_array()) { - $groups{$group} = 1; - } - $query->finish(); - return \%groups; -} - -sub domainsSql { - my ($group) = @_; - return "SELECT full_id, accession - FROM webready.ProteinDomainAssignment - WHERE group_name = '$group'"; -} - -sub ecsSql { - my ($group) = @_; - return "SELECT pda.full_id,ec.ec_number - FROM SRes.EnzymeClass ec, DoTS.AASequenceEnzymeClass aaec, webready.ProteinDomainAssignment pda - WHERE ec.enzyme_class_id = aaec.enzyme_class_id - AND aaec.aa_sequence_id = pda.aa_sequence_id - AND pda.group_name = '$group'" -} - -sub proteinsSql { - my ($group) = @_; - return "SELECT full_id,product,length,core_peripheral,group_name,organism_name - FROM webready.ProteinSequenceGroup WHERE group_name='$group'"; -} - -sub groupsSql { - my ($minNumProteinsWithEc,$maxNumProteinsWithEc) = @_; - - return "SELECT DISTINCT(group_name) FROM webready.ProteinSequenceGroup"; -} - -sub orthoOrganismSql { - my ($excludeOld) = @_; - my $whereClause = $excludeOld ? "AND three_letter_abbrev NOT LIKE '%-old'" : ""; - return "WITH RECURSIVE TaxonHierarchy AS ( - SELECT - three_letter_abbrev, - orthomcl_clade_id, - core_peripheral, - three_letter_abbrev AS taxon_group_abbrev - FROM apidb.OrthomclClade - WHERE core_peripheral = 'Z' - - UNION ALL - - SELECT - child.three_letter_abbrev, - child.orthomcl_clade_id, - child.core_peripheral, - parent.taxon_group_abbrev - FROM apidb.OrthomclClade child - JOIN TaxonHierarchy parent - ON child.parent_id = parent.orthomcl_clade_id - ) - SELECT three_letter_abbrev, taxon_group_abbrev - FROM TaxonHierarchy - WHERE core_peripheral IN ('C','P') - $whereClause"; -} - -sub getDbHandle { - - #my $gusConfigFile = $ENV{GUS_HOME} . "/config/gus.config"; - my ($gusConfigFile) = @_; - my @properties = (); - my $gusconfig = CBIL::Util::PropertySet->new($gusConfigFile, \@properties, 1); - - my $u = $gusconfig->{props}->{databaseLogin}; - my $pw = $gusconfig->{props}->{databasePassword}; - my $dsn = $gusconfig->{props}->{dbiDsn}; - - my $dbh = DBI->connect($dsn, $u, $pw) or die DBI::errstr; - $dbh->{RaiseError} = 1; - $dbh->{AutoCommit} = 0; - - return $dbh; -} - -sub makeDir { - my ($dir) = @_; - system("mkdir -p $dir") unless (-e $dir); -} - -sub getCladeToProject { - my %cladeToProject = ( - ALVE => "CryptoDB", - APIC => "CryptoDB", - HAEM => "PlasmoDB", - PIRO => "PiroplasmaDB", - AMOE => "AmoebaDB", - EUGL => "TriTrypDB", - FUNG => "FungiDB", - MICR => "MicrosporidiaDB", - BASI => "FungiDB", - ASCO => "FungiDB", - MUCO => "FungiDB", - CHYT => "FungiDB", - ARTH => "VectorBase", - MAMM => "HostDB", - OOMY => "FungiDB", - ACTI => "none", - ARCH => "none", - VIRI => "none", - CHOR => "none", - CHLO => "none", - CILI => "none", - CREN => "none", - CRYP => "none", - EURY => "none", - FIRM => "none", - KORA => "none", - FIRM => "none", - NEMA => "none", - OBAC => "none", - PROG => "none", - PROA => "none", - PROB => "none", - PROD => "none", - PROE => "none", - RHOD => "none", - STRE => "none", - TUNI => "none" - ); - return \%cladeToProject; -} From 692c42f67d7cc3c8a2f196d7332086262b458fa2 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 4 Mar 2026 13:28:19 -0500 Subject: [PATCH 4/7] Add group/cluster stats and length score to EC assignment output Add cluster_mean_length and length_vs_cluster_mean (fractional deviation from cluster mean) columns, plus group-level counts: group_size, n_annotated_in_group, n_supporting_in_group. Co-Authored-By: Claude Sonnet 4.6 --- Load/bin/assignEcByOrthologs.pl | 37 +++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl index 77dc1313b..a0fa3b527 100755 --- a/Load/bin/assignEcByOrthologs.pl +++ b/Load/bin/assignEcByOrthologs.pl @@ -6,6 +6,7 @@ use lib "$ENV{GUS_HOME}/lib/perl"; use Getopt::Long; +use List::Util qw(sum); use DBI; use GUS::Supported::GusConfig; @@ -175,7 +176,12 @@ n_supporting n_annotated_in_cluster cluster_size + cluster_mean_length + length_vs_cluster_mean cluster_profile + group_size + n_annotated_in_group + n_supporting_in_group )), "\n"; # --- Process each group --- @@ -192,6 +198,21 @@ push @{ $clusters{$key} }, $aaid; } + # Group-level stats (computed once per group, used in every output row). + # These are derived solely from %protein_ec, which is loaded from the DB + # at startup and never modified during processing, so these counts always + # reflect original (pre-transitive) annotations only. + my $group_size = scalar @members; + my $n_annotated_group = scalar grep { $protein_ec{$_} && @{ $protein_ec{$_} } } @members; + my %group_ec_count; # ec => count of proteins in group originally carrying it + for my $aaid (@members) { + my %seen; + for my $ec (@{ $protein_ec{$aaid} // [] }) { + next if $seen{$ec}++; + $group_ec_count{$ec}++; + } + } + # Report which cluster the focus protein landed in if ($verbose && $source_id) { my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } @members; @@ -239,6 +260,9 @@ my $cluster_size = scalar @cluster; my $display_profile = $profile_key eq '' ? 'none' : $profile_key; + my @cluster_lengths = map { $protein_info{$_}{protein_length} } @cluster; + my $cluster_mean = sum(@cluster_lengths) / $cluster_size; + # Count support per EC (post-normalization) my %ec_support; for my $aaid (keys %protein_ecs_in_cluster) { @@ -331,7 +355,11 @@ my $score = $pass->{score}; my $is_novel = $existing_ecs{$ec} ? 0 : 1; - printf $OUT "%s\t%s\t%s\t%d\t%s\t%d\t%.4f\t%d\t%d\t%d\t%s\n", + my $len_vs_mean = $cluster_mean > 0 + ? ($info->{protein_length} - $cluster_mean) / $cluster_mean + : 0; + + printf $OUT "%s\t%s\t%s\t%d\t%s\t%d\t%.4f\t%d\t%d\t%d\t%d\t%.4f\t%s\t%d\t%d\t%d\n", $gid, $aaid, $info->{source_id}, @@ -342,7 +370,12 @@ $support, $n_annotated, $cluster_size, - $display_profile; + int($cluster_mean + 0.5), + $len_vs_mean, + $display_profile, + $group_size, + $n_annotated_group, + $group_ec_count{$ec} // 0; $rows_written++; } From 356161f6a76e5bc4ed685deeb89b69a607c4a9da Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 4 Mar 2026 13:45:58 -0500 Subject: [PATCH 5/7] Update InsertEcMappingFromOrtho to match new assignEcByOrthologs TSV format - Read aa_sequence_id directly from file (no identifier lookup needed) - Populate new cluster/group stat columns: num_supporting_cluster, num_protein_cluster, num_any_ec_cluster, num_supporting_group, num_protein_group, num_any_ec_group, length_mean - Map confidence_score -> domain_score, length_vs_cluster_mean -> length_score - Load all rows (both novel and already-annotated) - Remove unused imports and aaSeqLocusTagMappingSql argument Co-Authored-By: Claude Sonnet 4.6 --- Load/plugin/perl/InsertEcMappingFromOrtho.pm | 246 ++++++++++--------- 1 file changed, 124 insertions(+), 122 deletions(-) diff --git a/Load/plugin/perl/InsertEcMappingFromOrtho.pm b/Load/plugin/perl/InsertEcMappingFromOrtho.pm index c09698b76..3890a2350 100644 --- a/Load/plugin/perl/InsertEcMappingFromOrtho.pm +++ b/Load/plugin/perl/InsertEcMappingFromOrtho.pm @@ -1,9 +1,8 @@ #^^^^^^^^^^^^^^^^^^^^^^^^^ End GUS4_STATUS ^^^^^^^^^^^^^^^^^^^^ ## InsertEcMappingFromOrtho.pm ## -## Creates new entries in the table DoTS.AASequenceEnzymeClass to represent -## the EC mappings found in a tab delimited file of the form EC number, alias -## $Id$ +## Creates new entries in the table DoTS.AASequenceEnzymeClass from the +## TSV output produced by assignEcByOrthologs.pl ## ####################################################################### @@ -14,88 +13,80 @@ use strict 'vars'; use GUS::PluginMgr::Plugin; use lib "$ENV{GUS_HOME}/lib/perl"; -use FileHandle; -use Carp; use GUS::Model::DoTS::AASequenceEnzymeClass; use GUS::Model::SRes::EnzymeClass; -use GUS::Model::DoTS::TranslatedAAFeature; -use GUS::Model::DoTS::GeneFeature; -#use GUS::Model::DoTS::NAFeatureNaGene; -use GUS::Model::DoTS::NAGene; my $purposeBrief = <$purpose, - purposeBrief=>$purposeBrief, - tablesAffected=>$tablesAffected, - tablesDependedOn=>$tablesDependedOn, - howToRestart=>$howToRestart, - failureCases=>$failureCases, - notes=>$notes - }; - +my $documentation = { purpose => $purpose, + purposeBrief => $purposeBrief, + tablesAffected => $tablesAffected, + tablesDependedOn => $tablesDependedOn, + howToRestart => $howToRestart, + failureCases => $failureCases, + notes => $notes + }; -my $argsDeclaration = +my $argsDeclaration = [ - fileArg({name => 'ECMappingFile', - descr => 'pathname for the file containing the EC mapping data', - constraintFunc => undef, - reqd => 1, - isList => 0, - mustExist => 1, - format => 'Two column tab delimited file in the order EC number, identifier' - }), - stringArg({name => 'evidenceCode', - descr => 'the evidence code with which data should be entered into the AASequenceEnzymeClass table', - reqd => 1, - constraintFunc => undef, - isList => 0, - }), - stringArg({name => 'aaSeqLocusTagMappingSql', - descr => 'sql which returns aa_sequence_id(s) for a given identifier in the EC mapping file. Use a question mark as a macro for where the id should be interpolated into the sql string. Id will most likely be a locus tag', - reqd => 1, - constraintFunc => undef, - isList => 0, - }) - + fileArg({ name => 'ECMappingFile', + descr => 'TSV file produced by assignEcByOrthologs.pl', + constraintFunc => undef, + reqd => 1, + isList => 0, + mustExist => 1, + format => 'Tab-delimited with header row' + }), + stringArg({ name => 'evidenceCode', + descr => 'Evidence code to store in AASequenceEnzymeClass (e.g. OrthoMCLDerived)', + reqd => 1, + constraintFunc => undef, + isList => 0, + }), ]; sub new { my ($class) = @_; my $self = {}; - bless($self,$class); - - $self->initialize({requiredDbVersion => 4.0, - cvsRevision => '$Revision: 24836 $', # cvs fills this in! - name => ref($self), - argsDeclaration => $argsDeclaration, - documentation => $documentation - }); - + bless($self, $class); + + $self->initialize({ requiredDbVersion => 4.0, + cvsRevision => '$Revision$', + name => ref($self), + argsDeclaration => $argsDeclaration, + documentation => $documentation + }); return $self; } @@ -107,87 +98,98 @@ sub run { my ($self) = @_; my $mappingFile = $self->getArg('ECMappingFile'); - my $evidCode = $self->getArg('evidenceCode'); - $self->getAlgInvocation()->setMaximumNumberOfObjects(100000); - - my $queryHandle = $self->getQueryHandle(); - - my %ecNumbers; - my %aaSeqIds; + my $evidCode = $self->getArg('evidenceCode'); - open (ECMAP, "$mappingFile") || die "Can't open the file $mappingFile. Reason: $!\n"; - while (my $line = ) { - chomp($line); - my @row = split('\t',$line); - next if (scalar @row != 9); - my ($id,$ec,$numProteinsWithEc,$numTotalProteins,$numGeneraWithEc,$numTotalGenera,$lengthScore,$blastScore,$domainScore) = @row; - - my $enzymeClass = $self->getEnzymeClass($ec, \%ecNumbers); - if (!$enzymeClass) { - print "This EC does not exist\n"; - next; - } + $self->getAlgInvocation()->setMaximumNumberOfObjects(100000); - my $aaSeqId = $self->getAASeqId($id, $queryHandle); - if (!$aaSeqId) { - print "This aaSeqId does not exist\n"; + my %ecCache; # ec_number => enzyme_class_id + my ($inserted, $skipped_existing, $skipped_no_ec) = (0, 0, 0); + + open(my $FH, '<', $mappingFile) or die "Cannot open '$mappingFile': $!\n"; + + my $header = <$FH>; # skip header line + + while (my $line = <$FH>) { + chomp $line; + my @row = split('\t', $line); + + # Expected columns (0-based): + # 0 group_id + # 1 aa_sequence_id + # 2 source_id + # 3 protein_length + # 4 assigned_ec_number + # 5 is_novel + # 6 confidence_score + # 7 n_supporting + # 8 n_annotated_in_cluster + # 9 cluster_size + # 10 cluster_mean_length + # 11 length_vs_cluster_mean + # 12 cluster_profile + # 13 group_size + # 14 n_annotated_in_group + # 15 n_supporting_in_group + + next unless scalar @row == 16; + + my ($group_id, $aa_sequence_id, $source_id, $protein_length, + $ec_number, $is_novel, $confidence_score, + $n_supporting, $n_annotated_in_cluster, $cluster_size, + $cluster_mean_length, $length_vs_cluster_mean, $cluster_profile, + $group_size, $n_annotated_in_group, $n_supporting_in_group) = @row; + + my $enzyme_class_id = $self->getEnzymeClassId($ec_number, \%ecCache); + unless ($enzyme_class_id) { + $self->log("WARNING: EC number '$ec_number' not found in SRes.EnzymeClass — skipping $source_id"); + $skipped_no_ec++; next; } - $self->log("Processing Pfid: $id, ECNumber: $ec"); - - my $newAASeqEnzClass = GUS::Model::DoTS::AASequenceEnzymeClass->new({ - 'aa_sequence_id' => $aaSeqId, - 'enzyme_class_id' => $enzymeClass, - 'evidence_code' => $evidCode, - 'num_protein_with_ec' => $numProteinsWithEc, - 'num_protein_in_group' => $numTotalProteins, - 'num_genera_with_ec' => $numGeneraWithEc, - 'num_genera_in_group' => $numTotalGenera, - 'length_score' => $lengthScore, - 'blast_score' => $blastScore, - 'domain_score' => $domainScore + my $newRow = GUS::Model::DoTS::AASequenceEnzymeClass->new({ + aa_sequence_id => $aa_sequence_id, + enzyme_class_id => $enzyme_class_id, + evidence_code => $evidCode, + domain_score => $confidence_score, + length_score => $length_vs_cluster_mean, + length_mean => $cluster_mean_length, + num_supporting_cluster => $n_supporting, + num_protein_cluster => $cluster_size, + num_any_ec_cluster => $n_annotated_in_cluster, + num_supporting_group => $n_supporting_in_group, + num_protein_group => $group_size, + num_any_ec_group => $n_annotated_in_group, }); - if (! $newAASeqEnzClass->retrieveFromDB()) { - $newAASeqEnzClass->submit(); - $self->log(" submitted enzyme $enzymeClass, seq $aaSeqId"); + + if ($newRow->retrieveFromDB()) { + $skipped_existing++; + } else { + $newRow->submit(); + $self->log("Inserted EC=$ec_number for $source_id (aa_sequence_id=$aa_sequence_id, group=$group_id)"); + $inserted++; } - $self->undefPointerCache(); - } - return "Finished processing EC Mapping file\n"; -} -###### FETCH THE EC ID FOR A GIVEN EC NUMBER ###### -sub getEnzymeClass { - my ($self, $ecNumber, $ecHash) = @_; - if (! exists $ecHash->{$ecNumber}) { - my $newEnzymeClass = GUS::Model::SRes::EnzymeClass->new({ - 'ec_number' => $ecNumber - }); - $newEnzymeClass->retrieveFromDB(); - $ecHash->{$ecNumber} = $newEnzymeClass->getId(); + $self->undefPointerCache(); } - return $ecHash->{$ecNumber}; + + close($FH); + + return "Inserted $inserted rows. Skipped: $skipped_existing already in DB, $skipped_no_ec unknown EC.\n"; } -###### FETCH THE AA SEQUNCE ID FOR A GIVEN ALIAS ###### -sub getAASeqId { - my ($self, $id, $dbh) = @_; - my $sql = "SELECT aa_sequence_id from dots.externalaasequence where source_id = '$id'"; - my $sth = $dbh->prepare($sql); - $sth->execute(); - my $aaSeqId; - while (my $aaSequenceId = $sth->fetchrow_array()) { - $aaSeqId = $aaSequenceId; +sub getEnzymeClassId { + my ($self, $ecNumber, $cache) = @_; + unless (exists $cache->{$ecNumber}) { + my $ec = GUS::Model::SRes::EnzymeClass->new({ ec_number => $ecNumber }); + $ec->retrieveFromDB(); + $cache->{$ecNumber} = $ec->getId(); } - $sth->finish(); - return $aaSeqId; + return $cache->{$ecNumber}; } sub undoTables { - my ($self) = @_; - - return ('DoTS.AASequenceEnzymeClass'); + my ($self) = @_; + return ('DoTS.AASequenceEnzymeClass'); } 1; From c4e4e1b6c71b83a66dc36765e03b0e4b5683fd92 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 4 Mar 2026 14:00:53 -0500 Subject: [PATCH 6/7] Ensure correct numeric types when loading EC mapping data Round domain_score and length_score to 4 decimal places, cast length_mean to integer, and explicitly int() all count columns. Co-Authored-By: Claude Sonnet 4.6 --- Load/plugin/perl/InsertEcMappingFromOrtho.pm | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Load/plugin/perl/InsertEcMappingFromOrtho.pm b/Load/plugin/perl/InsertEcMappingFromOrtho.pm index 3890a2350..0b9b52963 100644 --- a/Load/plugin/perl/InsertEcMappingFromOrtho.pm +++ b/Load/plugin/perl/InsertEcMappingFromOrtho.pm @@ -150,15 +150,15 @@ sub run { aa_sequence_id => $aa_sequence_id, enzyme_class_id => $enzyme_class_id, evidence_code => $evidCode, - domain_score => $confidence_score, - length_score => $length_vs_cluster_mean, - length_mean => $cluster_mean_length, - num_supporting_cluster => $n_supporting, - num_protein_cluster => $cluster_size, - num_any_ec_cluster => $n_annotated_in_cluster, - num_supporting_group => $n_supporting_in_group, - num_protein_group => $group_size, - num_any_ec_group => $n_annotated_in_group, + domain_score => sprintf("%.4f", $confidence_score), + length_score => sprintf("%.4f", $length_vs_cluster_mean), + length_mean => int($cluster_mean_length + 0.5), + num_supporting_cluster => int($n_supporting), + num_protein_cluster => int($cluster_size), + num_any_ec_cluster => int($n_annotated_in_cluster), + num_supporting_group => int($n_supporting_in_group), + num_protein_group => int($group_size), + num_any_ec_group => int($n_annotated_in_group), }); if ($newRow->retrieveFromDB()) { From eb051812dddf5217e947c7c93dbd99cdf97423d0 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 11 Mar 2026 14:18:06 -0400 Subject: [PATCH 7/7] Skip single-gene clusters and already-annotated EC assignments Only output rows for novel EC assignments (genes missing the EC) and only process interpro clusters with more than one gene. Co-Authored-By: Claude Sonnet 4.6 --- Load/bin/assignEcByOrthologs.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl index a0fa3b527..80a2cb76e 100755 --- a/Load/bin/assignEcByOrthologs.pl +++ b/Load/bin/assignEcByOrthologs.pl @@ -258,6 +258,7 @@ next if $n_annotated == 0; # nothing to propagate from my $cluster_size = scalar @cluster; + next if $cluster_size == 1; my $display_profile = $profile_key eq '' ? 'none' : $profile_key; my @cluster_lengths = map { $protein_info{$_}{protein_length} } @cluster; @@ -354,6 +355,7 @@ my $support = $pass->{support}; my $score = $pass->{score}; my $is_novel = $existing_ecs{$ec} ? 0 : 1; + next unless $is_novel; my $len_vs_mean = $cluster_mean > 0 ? ($info->{protein_length} - $cluster_mean) / $cluster_mean