From 9c9e516899ebb50868014d7b5a8a6266172f9487 Mon Sep 17 00:00:00 2001
From: John Brestelli <jbrestell@gmail.com>
Date: Tue, 3 Mar 2026 23:25:17 -0500
Subject: [PATCH 1/7] Add assignEcByOrthologs.pl: transitive EC assignment via
 ortholog groups

Uses group co-membership and shared InterPro family profiles to assign
EC numbers to unannotated proteins by majority vote within profile clusters.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Load/bin/assignEcByOrthologs.pl | 302 ++++++++++++++++++++++++++++++++
 1 file changed, 302 insertions(+)
 create mode 100755 Load/bin/assignEcByOrthologs.pl
diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl
new file mode 100755
index 000000000..bbbc118de
--- /dev/null
+++ b/Load/bin/assignEcByOrthologs.pl
@@ -0,0 +1,302 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use lib "$ENV{GUS_HOME}/lib/perl";
+
+use Getopt::Long;
+use DBI;
+use GUS::Supported::GusConfig;
+
+my ($gusConfigFile, $output, $threshold, $group_id, $verbose);
+$threshold = 0.60;
+
+&GetOptions(
+    "gusConfigFile=s" => \$gusConfigFile,
+    "output=s"        => \$output,
+    "threshold=f"     => \$threshold,
+    "group_id=s"      => \$group_id,
+    "verbose"         => \$verbose,
+);
+
+die "Usage: assignEcByOrthologs.pl --output <file> [--gusConfigFile <path>] [--threshold 0.60] [--group_id <id>] [--verbose]\n"
+    unless $output;
+
+$gusConfigFile ||= "$ENV{GUS_HOME}/config/gus.config";
+
+# --- DB connection ---
+my $gusconfig = GUS::Supported::GusConfig->new($gusConfigFile);
+my $dbh = DBI->connect(
+    $gusconfig->getDbiDsn(),
+    $gusconfig->getDatabaseLogin(),
+    $gusconfig->getDatabasePassword(),
+    { RaiseError => 1, AutoCommit => 0 }
+) or die DBI::errstr;
+
+# --- Load group members ---
+print STDERR "Loading group members...\n" if $verbose;
+
+my %group_members;   # group_id => [ aa_sequence_id, ... ]
+my %protein_info;    # aa_sequence_id => { source_id, protein_length }
+
+{
+    my $sql = q{
+        SELECT og.group_id,
+               og.aa_sequence_id,
+               s.source_id,
+               s.length AS protein_length
+        FROM   apidb.orthologgroupaasequence og
+        JOIN   dots.aasequence s ON s.aa_sequence_id = og.aa_sequence_id
+    };
+    $sql .= " WHERE og.group_id = ?" if $group_id;
+
+    my $sth = $dbh->prepare($sql);
+    $group_id ? $sth->execute($group_id) : $sth->execute();
+
+    while (my ($gid, $aaid, $src, $len) = $sth->fetchrow_array()) {
+        push @{ $group_members{$gid} }, $aaid;
+        $protein_info{$aaid} = { source_id => $src, protein_length => $len // 0 };
+    }
+    $sth->finish();
+}
+print STDERR "Loaded ", scalar(keys %group_members), " groups, ",
+             scalar(keys %protein_info), " proteins\n" if $verbose;
+
+# --- Load EC numbers (exclude OrthoMCLDerived) ---
+print STDERR "Loading EC annotations...\n" if $verbose;
+
+my %protein_ec;   # aa_sequence_id => [ ec_number, ... ]
+
+{
+    my $sql = q{
+        SELECT ae.aa_sequence_id,
+               ec.ec_number
+        FROM   dots.aasequenceenzymeclass ae
+        JOIN   sres.enzymeclass ec ON ec.enzyme_class_id = ae.enzyme_class_id
+        WHERE  (ae.evidence_code IS NULL OR ae.evidence_code != 'OrthoMCLDerived')
+    };
+
+    if ($group_id) {
+        $sql = q{
+            SELECT ae.aa_sequence_id,
+                   ec.ec_number
+            FROM   dots.aasequenceenzymeclass ae
+            JOIN   sres.enzymeclass ec ON ec.enzyme_class_id = ae.enzyme_class_id
+            JOIN   apidb.orthologgroupaasequence og ON og.aa_sequence_id = ae.aa_sequence_id
+            WHERE  (ae.evidence_code IS NULL OR ae.evidence_code != 'OrthoMCLDerived')
+              AND  og.group_id = ?
+        };
+    }
+
+    my $sth = $dbh->prepare($sql);
+    $group_id ? $sth->execute($group_id) : $sth->execute();
+
+    while (my ($aaid, $ec) = $sth->fetchrow_array()) {
+        push @{ $protein_ec{$aaid} }, $ec if exists $protein_info{$aaid};
+    }
+    $sth->finish();
+}
+my $ec_count = scalar grep { @{ $protein_ec{$_} } } keys %protein_ec;
+print STDERR "Loaded EC annotations for $ec_count proteins\n" if $verbose;
+
+# --- Load InterPro family IDs ---
+print STDERR "Loading InterPro family profiles...\n" if $verbose;
+
+my %protein_profile;   # aa_sequence_id => "IPR...|IPR..." (sorted, pipe-joined)
+
+{
+    my %ipr_sets;   # aa_sequence_id => { ipr_id => 1 }
+
+    my $sql = q{
+        SELECT ir.protein_source_id,
+               ir.interpro_family_id
+        FROM   apidb.interproresults ir
+        WHERE  ir.interpro_family_id IS NOT NULL
+          AND  ir.interpro_family_id != '-'
+    };
+
+    if ($group_id) {
+        $sql = q{
+            SELECT ir.protein_source_id,
+                   ir.interpro_family_id
+            FROM   apidb.interproresults ir
+            JOIN   dots.aasequence s ON s.source_id = ir.protein_source_id
+            JOIN   apidb.orthologgroupaasequence og ON og.aa_sequence_id = s.aa_sequence_id
+            WHERE  ir.interpro_family_id IS NOT NULL
+              AND  ir.interpro_family_id != '-'
+              AND  og.group_id = ?
+        };
+    }
+
+    my $sth = $dbh->prepare($sql);
+    $group_id ? $sth->execute($group_id) : $sth->execute();
+
+    my %src_to_aaid;
+    for my $aaid (keys %protein_info) {
+        $src_to_aaid{ $protein_info{$aaid}{source_id} } = $aaid;
+    }
+
+    while (my ($src, $ipr) = $sth->fetchrow_array()) {
+        my $aaid = $src_to_aaid{$src};
+        next unless defined $aaid;
+        $ipr_sets{$aaid}{$ipr} = 1;
+    }
+    $sth->finish();
+
+    for my $aaid (keys %ipr_sets) {
+        $protein_profile{$aaid} = join('|', sort keys %{ $ipr_sets{$aaid} });
+    }
+}
+print STDERR "Loaded InterPro profiles for ", scalar(keys %protein_profile), " proteins\n"
+    if $verbose;
+
+$dbh->disconnect();
+
+# --- Open output ---
+open(my $OUT, '>', $output) or die "Cannot open output '$output': $!\n";
+
+print $OUT join("\t", qw(
+    group_id
+    aa_sequence_id
+    source_id
+    protein_length
+    assigned_ec_number
+    is_novel
+    confidence_score
+    n_supporting
+    n_annotated_in_cluster
+    cluster_size
+    cluster_profile
+)), "\n";
+
+# --- Process each group ---
+my $groups_processed = 0;
+my $rows_written = 0;
+
+for my $gid (sort keys %group_members) {
+    my @members = @{ $group_members{$gid} };
+
+    # Cluster by profile key
+    my %clusters;   # profile_key => [ aa_sequence_id, ... ]
+    for my $aaid (@members) {
+        my $key = $protein_profile{$aaid} // '';
+        push @{ $clusters{$key} }, $aaid;
+    }
+
+    for my $profile_key (sort keys %clusters) {
+        my @cluster = @{ $clusters{$profile_key} };
+
+        # Collect all ECs from annotated proteins in this cluster
+        my %protein_ecs_in_cluster;   # aaid => [ec, ...]
+        my @all_cluster_ecs;
+
+        for my $aaid (@cluster) {
+            if ($protein_ec{$aaid} && @{ $protein_ec{$aaid} }) {
+                $protein_ecs_in_cluster{$aaid} = $protein_ec{$aaid};
+                push @all_cluster_ecs, @{ $protein_ec{$aaid} };
+            }
+        }
+
+        my $n_annotated = scalar keys %protein_ecs_in_cluster;
+        next if $n_annotated == 0;   # nothing to propagate from
+
+        # Count support per EC (post-normalization)
+        my %ec_support;
+        for my $aaid (keys %protein_ecs_in_cluster) {
+            my @p_ecs = normalize_ecs(@{ $protein_ecs_in_cluster{$aaid} });
+            my %seen;
+            for my $ec (@p_ecs) {
+                next if $seen{$ec}++;
+                $ec_support{$ec}++;
+            }
+        }
+
+        # Majority vote
+        my @passing_ecs;
+        for my $ec (sort keys %ec_support) {
+            my $support = $ec_support{$ec};
+            my $score   = $support / $n_annotated;
+            if ($score >= $threshold) {
+                push @passing_ecs, { ec => $ec, support => $support, score => $score };
+            }
+        }
+        next unless @passing_ecs;
+
+        my $cluster_size    = scalar @cluster;
+        my $display_profile = $profile_key eq '' ? 'none' : $profile_key;
+
+        for my $aaid (@cluster) {
+            my $info = $protein_info{$aaid};
+            my %existing_ecs = map { $_ => 1 } @{ $protein_ec{$aaid} // [] };
+
+            for my $pass (@passing_ecs) {
+                my $ec      = $pass->{ec};
+                my $support = $pass->{support};
+                my $score   = $pass->{score};
+                my $is_novel = $existing_ecs{$ec} ? 0 : 1;
+
+                printf $OUT "%s\t%s\t%s\t%d\t%s\t%d\t%.4f\t%d\t%d\t%d\t%s\n",
+                    $gid,
+                    $aaid,
+                    $info->{source_id},
+                    $info->{protein_length},
+                    $ec,
+                    $is_novel,
+                    $score,
+                    $support,
+                    $n_annotated,
+                    $cluster_size,
+                    $display_profile;
+
+                $rows_written++;
+            }
+        }
+    }
+
+    $groups_processed++;
+    if ($verbose && $groups_processed % 10000 == 0) {
+        print STDERR "Processed $groups_processed groups, $rows_written rows written\n";
+    }
+}
+
+close($OUT);
+
+print STDERR "Done. Processed $groups_processed groups, wrote $rows_written rows to $output\n";
+
+# ===========================================================================
+# Helpers
+# ===========================================================================
+
+sub normalize_ecs {
+    my @ecs = @_;
+    my @result;
+    for my $ec (@ecs) {
+        my $subsumed = 0;
+        for my $other (@ecs) {
+            next if $other eq $ec;
+            if (is_more_specific($other, $ec)) {
+                $subsumed = 1;
+                last;
+            }
+        }
+        push @result, $ec unless $subsumed;
+    }
+    return @result;
+}
+
+sub is_more_specific {
+    my ($specific, $general) = @_;
+    my @s = split /\./, $specific;
+    my @g = split /\./, $general;
+    return 0 unless @g == 4 && @s == 4;
+    for my $i (0 .. 3) {
+        if ($g[$i] eq '-') {
+            # general is a wildcard here; specific is more specific only if it has a defined value
+            return ($s[$i] ne '-') ? 1 : 0;
+        }
+        return 0 if $s[$i] ne $g[$i];   # mismatch at a defined position
+    }
+    # All 4 positions matched exactly — same specificity, not strictly more specific
+    return 0;
+}

From 4cccb4ba2ae938e3f45a99ccdd5e391176bcef12 Mon Sep 17 00:00:00 2001
From: John Brestelli <jbrestell@gmail.com>
Date: Wed, 4 Mar 2026 00:48:19 -0500
Subject: [PATCH 2/7] Add verbose debug output for skipped EC assignments

When --verbose is on, print the reason each EC in a group was not
transitively assigned: either subsumed by a more specific EC during
hierarchy normalization, or below the majority-vote threshold.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Load/bin/assignEcByOrthologs.pl | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl
index bbbc118de..5c64c6b04 100755
--- a/Load/bin/assignEcByOrthologs.pl
+++ b/Load/bin/assignEcByOrthologs.pl
@@ -201,6 +201,9 @@
         my $n_annotated = scalar keys %protein_ecs_in_cluster;
         next if $n_annotated == 0;   # nothing to propagate from
 
+        my $cluster_size    = scalar @cluster;
+        my $display_profile = $profile_key eq '' ? 'none' : $profile_key;
+
         # Count support per EC (post-normalization)
         my %ec_support;
         for my $aaid (keys %protein_ecs_in_cluster) {
@@ -212,6 +215,16 @@
             }
         }
 
+        if ($verbose) {
+            # Report any raw ECs that were dropped by hierarchy normalization
+            my %raw_unique = map { $_ => 1 } @all_cluster_ecs;
+            for my $ec (sort keys %raw_unique) {
+                unless (exists $ec_support{$ec}) {
+                    print STDERR "  SKIP $gid  profile=$display_profile  EC=$ec: subsumed by more specific EC in cluster\n";
+                }
+            }
+        }
+
         # Majority vote
         my @passing_ecs;
         for my $ec (sort keys %ec_support) {
@@ -219,13 +232,13 @@
             my $score   = $support / $n_annotated;
             if ($score >= $threshold) {
                 push @passing_ecs, { ec => $ec, support => $support, score => $score };
+            } elsif ($verbose) {
+                printf STDERR "  SKIP $gid  profile=$display_profile  EC=$ec: score too low (%d/%d = %.2f < %.2f)\n",
+                    $support, $n_annotated, $score, $threshold;
             }
         }
         next unless @passing_ecs;
 
-        my $cluster_size    = scalar @cluster;
-        my $display_profile = $profile_key eq '' ? 'none' : $profile_key;
-
         for my $aaid (@cluster) {
             my $info = $protein_info{$aaid};
             my %existing_ecs = map { $_ => 1 } @{ $protein_ec{$aaid} // [] };

From c24246cbc8111500112b407ddc009640c6de60fe Mon Sep 17 00:00:00 2001
From: John Brestelli <jbrestell@gmail.com>
Date: Wed, 4 Mar 2026 12:18:34 -0500
Subject: [PATCH 3/7] add some extra logging per source_id for debug purposes. 
 remove now unused scripots

---
 Load/bin/assignEcByOrthologs.pl |   86 +-
 Load/bin/newEcPrediction.pl     |  202 -----
 Load/bin/orthomclEcPrediction   | 1478 -------------------------------
 3 files changed, 84 insertions(+), 1682 deletions(-)
 delete mode 100644 Load/bin/newEcPrediction.pl
 delete mode 100644 Load/bin/orthomclEcPrediction

diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl
index 5c64c6b04..77dc1313b 100755
--- a/Load/bin/assignEcByOrthologs.pl
+++ b/Load/bin/assignEcByOrthologs.pl
@@ -9,7 +9,7 @@
 use DBI;
 use GUS::Supported::GusConfig;
 
-my ($gusConfigFile, $output, $threshold, $group_id, $verbose);
+my ($gusConfigFile, $output, $threshold, $group_id, $source_id, $verbose);
 $threshold = 0.60;
 
 &GetOptions(
@@ -17,10 +17,11 @@
     "output=s"        => \$output,
     "threshold=f"     => \$threshold,
     "group_id=s"      => \$group_id,
+    "source_id=s"     => \$source_id,
     "verbose"         => \$verbose,
 );
 
-die "Usage: assignEcByOrthologs.pl --output <file> [--gusConfigFile <path>] [--threshold 0.60] [--group_id <id>] [--verbose]\n"
+die "Usage: assignEcByOrthologs.pl --output <file> [--gusConfigFile <path>] [--threshold 0.60] [--group_id <id>] [--source_id <source_id>] [--verbose]\n"
     unless $output;
 
 $gusConfigFile ||= "$ENV{GUS_HOME}/config/gus.config";
@@ -153,6 +154,13 @@
 
 $dbh->disconnect();
 
+if ($source_id) {
+    my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } keys %protein_info;
+    die "ERROR: source_id '$source_id' not found in loaded data. Check the ID or --group_id filter.\n"
+        unless defined $focus_aaid;
+    print STDERR "Focusing on source_id=$source_id  (aa_sequence_id=$focus_aaid)\n" if $verbose;
+}
+
 # --- Open output ---
 open(my $OUT, '>', $output) or die "Cannot open output '$output': $!\n";
 
@@ -184,6 +192,25 @@
         push @{ $clusters{$key} }, $aaid;
     }
 
+    # Report which cluster the focus protein landed in
+    if ($verbose && $source_id) {
+        my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } @members;
+        if (defined $focus_aaid) {
+            my $profile = $protein_profile{$focus_aaid} // '';
+            my $display = $profile eq '' ? 'none (no InterPro family hits)' : $profile;
+            print STDERR "\n  PROTEIN $source_id  (aa_sequence_id=$focus_aaid)\n";
+            print STDERR "    group=$gid\n";
+            print STDERR "    InterPro profile: $display\n";
+            my $own_ecs = @{ $protein_ec{$focus_aaid} // [] }
+                ? join(', ', @{ $protein_ec{$focus_aaid} })
+                : '(none)';
+            print STDERR "    existing ECs on this protein: $own_ecs\n";
+            my $cluster_key = $protein_profile{$focus_aaid} // '';
+            my $cluster_size = scalar @{ $clusters{$cluster_key} };
+            print STDERR "    cluster_size (same profile): $cluster_size\n\n";
+        }
+    }
+
     for my $profile_key (sort keys %clusters) {
         my @cluster = @{ $clusters{$profile_key} };
 
@@ -199,6 +226,14 @@
         }
 
         my $n_annotated = scalar keys %protein_ecs_in_cluster;
+
+        if ($n_annotated == 0 && $verbose && $source_id) {
+            my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } @cluster;
+            if (defined $focus_aaid) {
+                my $display = $profile_key eq '' ? 'none' : $profile_key;
+                print STDERR "  PROTEIN $source_id: cluster (profile=$display) has no annotated proteins — nothing to propagate\n\n";
+            }
+        }
         next if $n_annotated == 0;   # nothing to propagate from
 
         my $cluster_size    = scalar @cluster;
@@ -237,6 +272,53 @@
                     $support, $n_annotated, $score, $threshold;
             }
         }
+        # If we are focusing on a specific protein, print its cluster context
+        # regardless of whether any ECs pass threshold
+        if ($verbose && $source_id) {
+            my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } @cluster;
+            if (defined $focus_aaid) {
+                print STDERR "\n  PROTEIN $source_id  (aa_sequence_id=$focus_aaid)\n";
+                print STDERR "    group=$gid  cluster_size=$cluster_size  n_annotated=$n_annotated\n";
+                print STDERR "    profile=$display_profile\n";
+                my $own_ecs = @{ $protein_ec{$focus_aaid} // [] }
+                    ? join(', ', @{ $protein_ec{$focus_aaid} })
+                    : '(none)';
+                print STDERR "    existing ECs on this protein: $own_ecs\n";
+                if ($n_annotated > 0) {
+                    print STDERR "    cluster members with ECs:\n";
+                    for my $other (sort keys %protein_ecs_in_cluster) {
+                        my $osrc = $protein_info{$other}{source_id};
+                        my @norm = normalize_ecs(@{ $protein_ecs_in_cluster{$other} });
+                        printf STDERR "      %s  raw=[%s]  normalized=[%s]\n",
+                            $osrc,
+                            join(', ', @{ $protein_ecs_in_cluster{$other} }),
+                            join(', ', @norm);
+                    }
+                    print STDERR "    EC vote results:\n";
+                    for my $ec (sort keys %ec_support) {
+                        my $score = $ec_support{$ec} / $n_annotated;
+                        my $verdict = $score >= $threshold ? 'PASS' : 'fail';
+                        printf STDERR "      EC=%s  %d/%d = %.2f  %s\n",
+                            $ec, $ec_support{$ec}, $n_annotated, $score, $verdict;
+                    }
+                } else {
+                    print STDERR "    no annotated proteins in this cluster — nothing to propagate\n";
+                }
+                if (@passing_ecs) {
+                    print STDERR "    ECs to be assigned:\n";
+                    my %existing_ecs = map { $_ => 1 } @{ $protein_ec{$focus_aaid} // [] };
+                    for my $pass (@passing_ecs) {
+                        my $novel = $existing_ecs{$pass->{ec}} ? 'already annotated' : 'NOVEL';
+                        printf STDERR "      EC=%s  score=%.2f (%d/%d)  %s\n",
+                            $pass->{ec}, $pass->{score}, $pass->{support}, $n_annotated, $novel;
+                    }
+                } else {
+                    print STDERR "    no ECs passed threshold — nothing assigned\n";
+                }
+                print STDERR "\n";
+            }
+        }
+
         next unless @passing_ecs;
 
         for my $aaid (@cluster) {
diff --git a/Load/bin/newEcPrediction.pl b/Load/bin/newEcPrediction.pl
deleted file mode 100644
index 71aebd913..000000000
--- a/Load/bin/newEcPrediction.pl
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/perl
-
-use warnings;
-use strict;
-use Data::Dumper;
-use lib "$ENV{GUS_HOME}/lib/perl";
-use GUS::Supported::GusConfig;
-use GUS::ObjRelP::DbiDatabase;
-use CBIL::Util::PropertySet;
-use DBI;
-
-my $outputFile = $ARGV[0];
-my $gusConfigFile = $ARGV[1];
-
-$gusConfigFile = "$ENV{GUS_HOME}/config/gus.config" unless ($gusConfigFile);
-my $gusconfig = GUS::Supported::GusConfig->new($gusConfigFile);
-
-my $verbose;
-my $db = GUS::ObjRelP::DbiDatabase->new($gusconfig->getDbiDsn(),
-                                        $gusconfig->getDatabaseLogin(),
-                                        $gusconfig->getDatabasePassword(),
-                                        $verbose,0,1,
-                                        $gusconfig->getCoreSchemaName()
-                                       );
-
-my $dbh = $db->getQueryHandle();
-
-open (OUT, ">$outputFile") || die "can not open $outputFile to write\n";
-
-# Get all groups
-my @groups = @{ &getGroups($dbh) };
-
-my $counter = 0;
-
-# For every group
-foreach my $groupId (@groups) {
-    
-    $counter += 1;
-
-    if ($counter % 1000 == 0) {
-        print "$counter groups processed...\n";
-    }
-
-    # Get all proteins from that group with an EC assignment
-    my %proteinToEc = %{ &getProteinToEC($dbh,$groupId) };
-
-    # Get all proteins that have a significant hit to the group centroid. Note, this is stored as the absolute log of the evalue.
-    my %proteinToEValue = %{ &getProteinToEValue($dbh,$groupId) };
-
-    my %ecBestEvalue;
-    my %processedProteins;
-
-    # For every EC assignment in this group, record the best blast evalue to the centroid
-    # For each protein  
-    foreach my $protein (keys %proteinToEc) {
-        # This protein gets a score of 4, as it is assigned this EC
-        print OUT "$protein\t$proteinToEc{$protein}\t4\n";
-        # If we already have an protein with this EC, and it has a significant blast to the centroid
-        if ($ecBestEvalue{$proteinToEc{$protein}} && $proteinToEValue{$protein}) {
-            if ($ecBestEvalue{$proteinToEc{$protein}} < $proteinToEValue{$protein}) {
-                $ecBestEvalue{$proteinToEc{$protein}} = $proteinToEValue{$protein};
-            }
-        }
-        # Another protein assigned this EC, but doesn't have a significant blast hit with the centroid
-        elsif ($ecBestEvalue{$proteinToEc{$protein}} && !$proteinToEValue{$protein}) {
-            next;
-        }
-        # First protein with this EC, and it has a significant blast value to the centroid
-        elsif (!$ecBestEvalue{$proteinToEc{$protein}} && $proteinToEValue{$protein}) {
-            $ecBestEvalue{$proteinToEc{$protein}} = $proteinToEValue{$protein};
-        }
-        # First protein with this EC, and it does not have a significant blast value to the centroid
-        else {
-            $ecBestEvalue{$proteinToEc{$protein}} = 5;
-        }
-        # Keeps track of which proteins we have already scored (4 as they have this EC assigned)
-	$processedProteins{$protein} = 1;
-    }
-
-    # For every protein in the group that has a significant blast hit to the centroid, we assign a score (if it had an EC assignment, it already got a 4)
-    foreach my $protein (keys %proteinToEValue) {
-        # We have already assigned this a 4
-        next if ($processedProteins{$protein});
-        # It already cannot get a score of 1
-        next if (!$proteinToEValue{$protein});
-        next if ($proteinToEValue{$protein} < 10);
-        # For every EC
-        foreach my $ec (keys %ecBestEvalue) {        
-            # If the protein to centroid blast was > 100
-            if ($proteinToEValue{$protein} > 100) {
-                # And the best blast to the centroid for all proteins with this EC was > 100
-                if ($ecBestEvalue{$ec} > 100) {
-                    # This protein EC combo scores a 3
-                    print OUT "$protein\t$ec\t3\n";
-                }
-                # If the the best blast to the centroid for all proteins with this EC was > 50
-                elsif ($ecBestEvalue{$ec} > 50) {
-                    # This protein EC combo scores a 2
-                    print OUT "$protein\t$ec\t2\n";
-                }
-                # If the the best blast to the centroid for all proteins with this EC was > 10
-                elsif ($ecBestEvalue{$ec} >= 10) {
-                    # This protein EC combo scores a 1
-                    print OUT "$protein\t$ec\t1\n";
-                }
-            }
-            # Same process for this but this protein EC combo can only score a max of a 2
-            elsif ($proteinToEValue{$protein} > 50) {
-                if ($ecBestEvalue{$ec} > 50) {
-                    print OUT "$protein\t$ec\t2\n";
-                }
-                elsif ($ecBestEvalue{$ec} >= 10) {
-                    print OUT "$protein\t$ec\t1\n";
-                }
-            }
-            # Same process for this but this protein EC combo can only score a max of a 1
-            elsif ($proteinToEValue{$protein} >= 10) {
-                if ($ecBestEvalue{$ec} >= 10) {
-                    print OUT "$protein\t$ec\t1\n";
-                }
-            }
-        }
-    }
-}
-
-close OUT;
-
-sub getGroups {
-    my ($dbh) = @_;
-    my $sql = "SELECT DISTINCT(group_name) FROM webready.ProteinSequence_pgroup";
-    my $sh = $dbh->prepare($sql);
-    $sh->execute();
-    while(my ($groupId) = $sh->fetchrow_array()) {
-        push(@groups,$groupId);
-    }
-    $sh->finish();
-    return \@groups;
-}
-
-sub getProteinToEValue {
-    my ($dbh,$groupId) = @_;
-    my $sql = "SELECT qseq,evalue FROM apidb.orthogroupblastvalue
-               WHERE group_id = '$groupId'";
-    my $sh = $dbh->prepare($sql);
-    $sh->execute();
-
-    my %proteinToEValue;
-    while(my ($protein,$evalue) = $sh->fetchrow_array()) {
-        if ($evalue eq '0.0') {
-            $evalue = 1.0e-300;
-        }
-	$proteinToEValue{$protein} = abs_log10($evalue);
-    }
-    $sh->finish();
-
-    return \%proteinToEValue;
-}
-
-sub getProteinToDomain {
-    my ($dbh,$groupId) = @_;
-    my $sql = "SELECT full_id,accession FROM webready.ProteinDomainAssignment
-               WHERE group_name = '$groupId'";
-    my $sh = $dbh->prepare($sql);
-    $sh->execute();
-
-    my %proteinToDomain;
-    while(my ($protein,$domain) = $sh->fetchrow_array()) {
-	$proteinToDomain{$protein} = $domain;
-    }
-    $sh->finish();
-
-    return \%proteinToDomain;
-}
-
-sub getProteinToEC {
-    my ($dbh,$groupId) = @_;
-    my $sql = "SELECT pda.full_id,ec.ec_number
-               FROM SRes.EnzymeClass ec, DoTS.AASequenceEnzymeClass aaec, webready.ProteinDomainAssignment pda
-               WHERE ec.enzyme_class_id = aaec.enzyme_class_id 
-               AND aaec.aa_sequence_id = pda.aa_sequence_id
-               AND pda.group_name = '$groupId'";
-    my $sh = $dbh->prepare($sql);
-    $sh->execute();
-
-    my %proteinToEC;
-    while(my ($protein,$ec) = $sh->fetchrow_array()) {
-	$proteinToEC{$protein} = $ec;
-    }
-    $sh->finish();
-
-    return \%proteinToEC;
-}
-
-sub abs_log10 {
-    my ($num) = @_;
-
-    # Handle zero or negative input
-    return undef if $num <= 0;
-
-    # Take the log base 10 and return absolute value, rounded
-    return int(abs(log($num) / log(10)) + 0.5);  # round to nearest int
-}
diff --git a/Load/bin/orthomclEcPrediction b/Load/bin/orthomclEcPrediction
deleted file mode 100644
index b8971573e..000000000
--- a/Load/bin/orthomclEcPrediction
+++ /dev/null
@@ -1,1478 +0,0 @@
-#!/usr/bin/perl
-
-use warnings;
-use strict;
-use Data::Dumper;
-use lib "$ENV{GUS_HOME}/lib/perl";
-use CBIL::Util::PropertySet;
-use DBI;
-
-my $outputDir = $ARGV[0];
-my $outputFile = $ARGV[1];
-my $gusConfigFile = $ARGV[2];
-
-my $minNumProteinsWithEc = 1;    # if not testing, then set to 1
-my $maxNumProteinsWithEc = 0;    # if not testing, then set to 0
-my $minNumGeneraForViableEc = 1;
-my $minNumProteinsForViableEc = 1;
-my $excludeOld = 1;
-my $createStatsFile = 0;       # if not testing, then set to 0
-my $createProteinFile = 0;     # if not testing, then set to 0
-my $createLogFile = 0;         # if not testing, then set to 0 and the log will go to STDERR
-# If this is ever run, it will fail
-my $test = 0;                  # if not testing, then set to 0
-my $fractionOfGroups = 1;      # if not testing, then set to 1
-
-my ($testFraction,$totalEcsTested,$noEcMatch,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh) = &setUpTest($outputDir) if ($test);
-
-my ($logFh,$netGroupsRef,$dbh,$groups) = &setUpPrediction($outputDir,$fractionOfGroups,$minNumProteinsWithEc,$maxNumProteinsWithEc,$createLogFile, $gusConfigFile);
-
-if (!$test) {
-    open(my $outFh, ">", $outputFile) || die "Cannot open file '$outputFile' for writing";
-    close $outFh;
-}
-
-foreach my $group (keys %{$groups}) {
-    next if (&nextGroup($fractionOfGroups,$netGroupsRef,$logFh));
-
-    my ($statsFh,$proteinFile) = &makeGroupDirAndFiles($outputDir,$group,$createStatsFile,$createProteinFile);
-    my ($proteinIds,$numTotalProteins,$numTotalGenera) = &getProteinInfo($group,$excludeOld,$dbh,$proteinFile);
-
-    my ($trainingIds,$testIds);
-    ($trainingIds,$testIds) = &getTrainingAndTestIds($proteinIds,$testFraction) if ($test);
-    next if ($test && ($trainingIds eq "" || $testIds eq ""));
-
-    my $viableEcNumbers = &getViableEcNumbers($proteinIds,$test,$trainingIds,$minNumGeneraForViableEc,$minNumProteinsForViableEc,$statsFh);
-  
-    my ($domainStatsPerEc,$domainPerProtein) = &ecDomainStats($proteinIds,$test,$trainingIds,$viableEcNumbers,$statsFh);
-    my $lengthStatsPerEc = &ecLengthStats($proteinIds,$test,$trainingIds,$viableEcNumbers,$statsFh);
-    my ($blastStatsPerEc,$blastStatsPerProtein,$proteinToEc) = &ecBlastStats($dbh,$group);
-
-    close $statsFh if ($statsFh);
-
-    my $scores =  &getProteinScores($proteinIds,$numTotalProteins,$numTotalGenera,$test,$testIds,$viableEcNumbers,$domainStatsPerEc,$domainPerProtein,$lengthStatsPerEc,$blastStatsPerEc,$blastStatsPerProtein,$proteinToEc,$outputFile,$dbh);
-
-     &testScores($scores,$proteinIds,$testIds,$totalEcsTested,$noEcMatch,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh,$logFh) if ($test);
-}
-
-&printTest($totalEcsTested,$noEcMatch,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh) if ($test);
-
-$dbh->disconnect();
-
-exit;
-
-################################  SUBROUTINES  ########################################
-
-sub setUpPrediction {
-    my ($outputDirectory,$fractionOfGroups,$minNumProteinsWithEc,$maxNumProteinsWithEc,$createLogFile, $gusConfigFile) = @_;
-    my $dbh = &getDbHandle($gusConfigFile);
-    my $groups = &getGroupsFromDatabase($minNumProteinsWithEc,$maxNumProteinsWithEc,$dbh);
-
-    my $logFh;
-    my $numGroups = keys %{$groups};
-    my $netGroups = sprintf("%.0f",$fractionOfGroups * $numGroups);
-    if ($createLogFile) {
-	my $logFile = "$outputDirectory/log.txt";
-	open($logFh,">",$logFile) || die "Cannot open file '$logFile' for writing";
-    }
-    print {$logFh ? $logFh : *STDERR } "Start time: ".localtime()."\n";
-    print {$logFh ? $logFh : *STDERR } "Groups must have between $minNumProteinsWithEc and $maxNumProteinsWithEc proteins with EC numbers\n";
-    print {$logFh ? $logFh : *STDERR } "Total groups: $numGroups  Fraction: $fractionOfGroups  Net number of groups: $netGroups\n";
-    return ($logFh,\$netGroups,$dbh,$groups);
-}
-
-sub nextGroup {
-    my ($fractionOfGroups,$netGroupsRef,$logFh) = @_;
-    return 1 if (rand(1) >= $fractionOfGroups);
-    print {$logFh ? $logFh : *STDERR } "Approx number groups remaining: ${$netGroupsRef}\n";
-    ${$netGroupsRef}--;
-    return 0;
-}
-
-sub setUpTest {
-    my ($outputDirectory) = @_;
-    my $testFraction = 0.3;
-    my $totalEcsTested=0;
-    my $noEcMatch=0;
-    my $numProteinsExtraEc=0;
-    my $numExtraEc=0;
-    my $numTotalProteins=0;
-    my %testExact;
-    my %testLessPrecise;
-    my %testMorePrecise;
-    my $testFile =  "$outputDirectory/test.txt";
-    open(my $testFh,">",$testFile) || die "Cannot open $testFile for writing\n";
-    return (\$testFraction,\$totalEcsTested,\$noEcMatch,\%testExact,\%testLessPrecise,\%testMorePrecise,\$numProteinsExtraEc,\$numExtraEc,\$numTotalProteins,$testFh);
-}
-
-
-sub getTrainingAndTestIds {
-    my ($proteinIds,$testFraction) = @_;
-    my ($trainingIds,$testIds);
-
-    my $numProteinsWithEc = &numProteinsWithEc($proteinIds);
-    return ("","") if $numProteinsWithEc < 2;
-    my $uniqueEcNumbers = &getUniqueEcNumbersFromProteins($proteinIds,0,0);
-    foreach my $uniqueEc (keys %{$uniqueEcNumbers}) {
-	return ("","") if ($uniqueEcNumbers->{$uniqueEc} < 2);
-    }
-    my $numProteinsTest = sprintf("%.0f",${$testFraction} * $numProteinsWithEc);
-    $numProteinsTest = 1 if ($numProteinsTest < 1);
-    $numProteinsTest = $numProteinsWithEc - 1 if ($numProteinsTest == $numProteinsWithEc);
-    my $testIdPositions = &getTestIdPositions($numProteinsTest,$numProteinsWithEc);
-
-    my $proteinCounter = 1;
-    foreach my $id (keys %{$proteinIds}) {
-	next if (scalar @{$proteinIds->{$id}->{ec}} == 0);
-	if (exists $testIdPositions->{$proteinCounter}) {
-	    $testIds->{$id} = 1;
-	} else {
-	    $trainingIds->{$id} = 1;
-	}
-	$proteinCounter++;
-    }
-    return("","") if (&ecsNotInTrainingAndTest($proteinIds,$trainingIds,$testIds,$uniqueEcNumbers));
-    return ($trainingIds,$testIds);
-}
-
-sub ecsNotInTrainingAndTest {
-    my ($proteinIds,$trainingIds,$testIds,$uniqueEcNumbers) = @_;
-    foreach my $uniqueEc (keys %{$uniqueEcNumbers}) {
-	my $found=0;
-	foreach my $id (keys %{$trainingIds}) {
-	    foreach my $idEc (@{$proteinIds->{$id}->{ec}}) {
-		if ($idEc eq $uniqueEc) {
-		    $found=1;
-		    last;
-		}
-	    }
-	    last if ($found == 1);
-	}
-	return 1 if ($found == 0);
-	foreach my $id (keys %{$testIds}) {
-	    foreach my $idEc (@{$proteinIds->{$id}->{ec}}) {
-		if ($idEc eq $uniqueEc) {
-		    $found=2;
-		    last;
-		}
-	    }
-	    last if ($found == 2);
-	}
-	return 1 if ($found == 1);
-    }
-    return 0;
-}
-
-
-sub getTestIdPositions {
-    my ($numProteinsTest,$numProteinsWithEc) = @_;
-    my %testPositions;
-    while (keys %testPositions < $numProteinsTest ) {
-	my $randomNumber = int(rand($numProteinsWithEc)) + 1;
-	next if (exists $testPositions{$randomNumber});
-	$testPositions{$randomNumber} = 1;
-    }
-    return \%testPositions;
-}
-
-sub makeGroupDirAndFiles {
-    my ($outputDirectory,$group,$createStatsFile,$createProteinFile) = @_;
-    &makeDir("$outputDirectory/$group") if ($createStatsFile || $createProteinFile);
-    my $statsFh;
-    if ($createStatsFile) {
-	my $statsFile = "$outputDirectory/$group/stats.txt";
-	open($statsFh,">",$statsFile) || die "Cannot open $statsFile for writing\n";
-    }
-    my $proteinFile;
-    if ($createProteinFile) {
-	$proteinFile = "$outputDirectory/$group/proteins.txt";
-    }
-    return ($statsFh,$proteinFile);
-}
-
-sub abs_log10 {
-    my ($num) = @_;
-
-    # Handle zero or negative input                                                                                                        
-    return undef if $num <= 0;
-
-    # Take the log base 10 and return absolute value, rounded                                                 
-    return int(abs(log($num) / log(10)) + 0.5);
-}
-
-sub getAbbrev {
-    my ($dbh,$Id) = @_;
-    my $sql = "SELECT taxon_abbreviation FROM webready.ProteinSequenceGroup WHERE full_id = '$Id'";
-    my $sh = $dbh->prepare($sql);
-    $sh->execute();
-
-    my $abbrev;
-    while(my $taxon_abbrev = $sh->fetchrow_array()) {
-        $abbrev = $taxon_abbrev
-    }
-    $sh->finish();
-
-    return $abbrev;
-}
-
-sub getProteinToEValue {
-    my ($dbh,$groupId) = @_;
-    my $sql = "SELECT qseq, evalue FROM apidb.orthologgroupblastvalue WHERE group_id = '$groupId'";
-    my $sh = $dbh->prepare($sql);
-    $sh->execute();
-
-    my %proteinToEValue;
-    while(my ($protein,$evalue) = $sh->fetchrow_array()) {
-        if ($evalue eq '0.0') {
-            $evalue = 1.0e-300;
-	}
-        $proteinToEValue{$protein} = abs_log10($evalue);
-    }
-    $sh->finish();
-
-    return \%proteinToEValue;
-}
-
-sub getProteinToEC {
-    my ($dbh,$groupId) = @_;
-    my $sql = "SELECT pda.full_id,ec.ec_number
-               FROM SRes.EnzymeClass ec, DoTS.AASequenceEnzymeClass aaec, webready.ProteinDomainAssignment pda
-               WHERE ec.enzyme_class_id = aaec.enzyme_class_id 
-               AND aaec.aa_sequence_id = pda.aa_sequence_id
-               AND pda.group_name = '$groupId'";
-    my $sh = $dbh->prepare($sql);
-    $sh->execute();
-
-    my %proteinToEC;
-    while(my ($protein,$ec) = $sh->fetchrow_array()) {
-        $proteinToEC{$protein} = $ec;
-    }
-    $sh->finish();
-
-    return \%proteinToEC;
-}
-
-
-sub ecBlastStats {
-    my ($dbh,$group) = @_;
-
-    # Get all proteins from that group with an EC assignment
-    my %proteinToEc = %{ &getProteinToEC($dbh,$group) };
-
-    # Get all proteins that have a significant hit to the group centroid. Note, this is stored as the absolute log of the evalue.
-    my %proteinToEValue = %{ &getProteinToEValue($dbh,$group) };
-
-    my %ecBestEvalue;
-
-    # For every EC assignment in this group, record the best blast evalue to the centroid                                                                             
-    # For each protein                                                                     
-    foreach my $protein (keys %proteinToEc) {
-        # If we already have an protein with this EC, and it has a significant blast to the centroid
-        if ($ecBestEvalue{$proteinToEc{$protein}} && $proteinToEValue{$protein}) {
-            if ($ecBestEvalue{$proteinToEc{$protein}} < $proteinToEValue{$protein}) {
-                $ecBestEvalue{$proteinToEc{$protein}} = $proteinToEValue{$protein};
-            }
-        }
-        # Another protein assigned this EC, but doesn't have a significant blast hit with the centroid
-        elsif ($ecBestEvalue{$proteinToEc{$protein}} && !$proteinToEValue{$protein}) {
-            next;
-        }
-        # First protein with this EC, and it has a significant blast value to the centroid
-        elsif (!$ecBestEvalue{$proteinToEc{$protein}} && $proteinToEValue{$protein}) {
-            $ecBestEvalue{$proteinToEc{$protein}} = $proteinToEValue{$protein};
-        }
-        # First protein with this EC, and it does not have a significant blast value to the centroid
-        else {
-            $ecBestEvalue{$proteinToEc{$protein}} = 5;
-        }
-    }
-
-    return (\%ecBestEvalue,\%proteinToEValue,\%proteinToEc);
-}
-
-sub proteinDoesNotExist {
-    my ($proteinName,$proteinIds,$statsFh,$missingHashRef) = @_;
-
-    if (! exists $proteinIds->{$proteinName}) {
-	if ($statsFh && ! exists $missingHashRef->{$proteinName}) {
-	    print $statsFh "ERROR: The protein '$proteinName' does not exist in the original protein ids\n";
-	    $missingHashRef->{$proteinName}=1;
-	}
-	return 1;
-    }
-    return 0;
-}
-
-sub numProteinsWithDomainAndEc {
-    my ($proteinIds) = @_;
-    my $numProteins = 0;
-    foreach my $id (keys %{$proteinIds}) {
-	my $numDomains = scalar @{$proteinIds->{$id}->{domain}};
-	my $numEcs = scalar @{$proteinIds->{$id}->{ec}};
-	if ($numDomains > 0 && $numEcs > 0) {
-	    $numProteins++;
-	}
-    }
-    return $numProteins;
-}
-
-sub numProteinsWithEc {
-    my ($proteinIds) = @_;
-    my $numProteins = 0;
-    foreach my $id (keys %{$proteinIds}) {
-	my $numEcs = scalar @{$proteinIds->{$id}->{ec}};
-	$numProteins++ if ($numEcs > 0);
-    }
-    return $numProteins;
-}
-
-sub getViableEcNumbers {
-    my ($proteinIds,$test,$trainingIds,$minNumGenera,$minNumProteins,$statsFh) = @_;
-
-    my $actualEcNumbers = &getUniqueEcNumbersFromProteins($proteinIds,$test,$trainingIds);
-    #my $allEcNumbers = &addPartialEcNumbers($actualEcNumbers);
-    my $ecNumbersWithCounts = &getNumProteinsGeneraForEachEc($proteinIds,$test,$trainingIds,$actualEcNumbers);
-    &deletePartialEcNumbers($ecNumbersWithCounts);
-    &deleteEcNumbersBelowMin($ecNumbersWithCounts,$minNumProteins,$minNumGenera);
-    &printViableEcNumbers($ecNumbersWithCounts,$statsFh) if ($statsFh);
-    
-    return $ecNumbersWithCounts;
-}
-
-sub printViableEcNumbers {
-    my ($ecNumbersWithCounts,$statsFh) = @_;
-    print $statsFh "EC_NUMBER\tNUM_PROTEINS\tNUM_GENERA\n";
-    foreach my $ec (sort keys %{$ecNumbersWithCounts}) {
-	print $statsFh "$ec\t$ecNumbersWithCounts->{$ec}->{numProteins}\t$ecNumbersWithCounts->{$ec}->{numGenera}\n";
-    }
-    print $statsFh "\n";
-}
-
-sub getUniqueEcNumbersFromProteins {
-    my ($proteinIds,$test,$trainingIds) = @_;
-    my $ecs;
-    foreach my $id (keys %{$proteinIds}) {
-	next if ($test && ! exists $trainingIds->{$id});
-	foreach my $ec ( @{$proteinIds->{$id}->{ec}} ) {
-	    $ecs->{$ec}++;
-	}
-    }
-    return $ecs;
-}
-
-sub validEcNumber {
-    my ($ec) = @_;
-    if ( $ec =~ /^[0-9]+\.[0-9]+\.[0-9\-]+\.[0-9\-]+$/ ) {
-	return 1;
-    } else {
-	return 0;
-    }
-}
-
-sub addPartialEcNumbers {
-    my ($ecs) = @_;
-    my $allEcs;
-    foreach my $ec (keys %{$ecs}) {
-	$allEcs->{$ec} = 1;
-	my ($a,$b,$c,$d) = split(/\./,$ec);
-	$allEcs->{"$a.$b.$c.-"} = 1;
-	$allEcs->{"$a.$b.-.-"} = 1;
-    }
-    return $allEcs;
-}
-
-sub getNumProteinsGeneraForEachEc {
-    my ($proteinIds,$test,$trainingIds,$allEcNumbers) = @_;
-    my $ecNumbersWithCounts;
-    my $ecGenera;
-    foreach my $id (keys %{$proteinIds}) {
-	next if ($test && ! exists $trainingIds->{$id});
-	my $genus = &getGenusFromProtein($proteinIds->{$id});
-	foreach my $ec ( keys %{$allEcNumbers} ) {
-	    if (&proteinHasThisEcNumber($proteinIds->{$id},$ec)) {
-		$ecNumbersWithCounts->{$ec}->{numProteins}++;
-		$ecGenera->{$ec} = &addArrayElement($ecGenera->{$ec},$genus);
-	    }
-	}
-    }
-    
-    &addNumberGenera($ecNumbersWithCounts,$ecGenera);
-    return $ecNumbersWithCounts;
-}
-
-sub getGenusFromProtein {
-    my ($protein) = @_;
-    my @taxonArray = split(" ",$protein->{taxon});
-    return $taxonArray[0];
-}
-
-sub addArrayElement {
-    my ($arrayRef,$element) = @_;
-    if (defined $arrayRef) {
-	push @{$arrayRef}, $element;
-    } else {
-	$arrayRef = [$element];
-    }
-    return $arrayRef;
-}
-
-sub addNumberGenera {
-    my ($ecNumbersWithCounts,$ecGenera) = @_;
-    foreach my $ec (keys %{$ecNumbersWithCounts}) {
-	my %allGenera = map { $_ => 1 } @{$ecGenera->{$ec}};
-	my $numGenera = scalar keys %allGenera;
-	$ecNumbersWithCounts->{$ec}->{numGenera} = $numGenera
-    }
-}
-
-sub deleteEcNumbersBelowMin {
-    my ($ecs,$minNumProteins,$minNumGenera) = @_;
-    my %toDelete;
-    foreach my $ec (keys %{$ecs}) {
-	if ($ecs->{$ec}->{numProteins} < $minNumProteins || $ecs->{$ec}->{numGenera} < $minNumGenera) {
-	    $toDelete{$ec} = 1;
-	}
-    }
-    foreach my $ec (keys %toDelete) {
-	delete $ecs->{$ec};
-    }
-}
-
-sub deletePartialEcNumbers {
-    my ($ecs) = @_;
-    my %toDelete;
-    foreach my $ec (keys %{$ecs}) {
-	my $parentEc = &getParent($ec);
-	if ($parentEc && exists $ecs->{$parentEc}) {
-	    if ($ecs->{$ec}->{numProteins} == $ecs->{$parentEc}->{numProteins}) {
-		$toDelete{$parentEc} = 1;      # if parent has same number proteins then delete parent
-	    }
-	}
-    }
-    foreach my $ec (keys %toDelete) {
-	delete $ecs->{$ec};
-    }
-}
-
-sub getParent {
-    my ($ec) = @_;
-    return "" if ($ec eq "");
-    my ($a,$b,$c,$d) = split(/\./,$ec);
-    return "" if ($c eq "-");
-    return "$a.$b.-.-" if ($d eq "-");
-    return "$a.$b.$c.-";
-}
-
-sub getAllAncestors {
-    my ($ec,$ancestorEcs) = @_;
-    my $parentEc = &getParent($ec);
-    if ($parentEc ne "") {
-	$ancestorEcs->{$parentEc} = 1;
-	&getAllAncestors($parentEc,$ancestorEcs);
-    }
-}
-
-sub readDomainCountFile {
-    my ($domainCountFile) = @_;
-    my $domainCount;
-
-    open(IN,$domainCountFile) || die "Cannot open $domainCountFile\n";
-    my $header = <IN>;
-    my $totalLine = <IN>;
-    chomp($totalLine);
-    my ($text,$numProteins) = split("\t",$totalLine);
-    $domainCount->{numProteins} = $numProteins;
-    while (my $line=<IN>) {
-	chomp($line);
-	my ($domain,$num) = split("\t",$line);
-	$domainCount->{domain}->{$domain} = $num;
-    }
-    close IN;
-    return $domainCount;
-}
-
-sub ecLengthStats {
-    my ($proteinIds,$test,$trainingIds,$viableEcNumbers,$statsFh) = @_;
-
-    # for each viable EC number, obtain array of protein lengths
-    my $ecNumbers;
-    foreach my $id (keys %{$proteinIds}) {
-	next if ( scalar @{$proteinIds->{$id}->{ec}} == 0 );
-	next if ($test && ! exists $trainingIds->{$id});
-	foreach my $viableEc ( keys %{$viableEcNumbers} ) {
-	    if (&proteinHasThisEcNumber($proteinIds->{$id},$viableEc)) {
-		$ecNumbers->{$viableEc} = &addArrayElement($ecNumbers->{$viableEc},$proteinIds->{$id}->{length});
-	    }
-	}
-    }
-
-    # for each EC number, calculate min, max, avg, median
-    my $ecStats;
-    foreach my $ec (keys %{$ecNumbers}) {
-	my $noValues = scalar @{$ecNumbers->{$ec}} == 0 ? 1 : 0;
-	$ecStats->{$ec}->{numProteins} = scalar @{$ecNumbers->{$ec}};
-	$ecStats->{$ec}->{min} = $noValues ? -1 : min($ecNumbers->{$ec});
-	$ecStats->{$ec}->{max} = $noValues ? -1 : max($ecNumbers->{$ec});
-	$ecStats->{$ec}->{median} = $noValues ? -1 : median($ecNumbers->{$ec});
-	my ($mean,$sd) = meanSd($ecNumbers->{$ec});
-	$ecStats->{$ec}->{mean} = $noValues ? -1 : $mean;
-	$ecStats->{$ec}->{sd} = $noValues ? -1 : $sd;
-    }
-
-    &printEcLengthStats($ecStats,$statsFh) if ($statsFh);
-
-    return $ecStats;
-}
-
-sub printEcLengthStats {
-    my ($ecStats,$statsFh) = @_;
-    print $statsFh "EC_NUMBER\tNUM_PROTEINS\tMIN_LENGTH\tMAX_LENGTH\t_MEDIAN_LENGTH\tMEAN_LENGTH\tSTD_DEV_LENGTH\n";
-    foreach my $ec (sort keys %{$ecStats}) {
-	    print $statsFh "$ec\t$ecStats->{$ec}->{numProteins}";
-	    print $statsFh "\t$ecStats->{$ec}->{min}\t$ecStats->{$ec}->{max}";
-	    print $statsFh "\t$ecStats->{$ec}->{median}\t$ecStats->{$ec}->{mean}";
-	    print $statsFh "\t$ecStats->{$ec}->{sd}\n";
-    }
-    print $statsFh "\n";
-}
-
-sub min {
-    my ($arrayRef) = @_;
-    my $min = $arrayRef->[0];
-    foreach my $number (@{$arrayRef}) {
-	$min = $number if ($number < $min);
-    }
-    return $min;
-}
-
-sub max {
-    my ($arrayRef) = @_;
-    my $max = $arrayRef->[0];
-    foreach my $number (@{$arrayRef}) {
-	$max = $number if ($number > $max);
-    }
-    return $max;
-}
-
-sub meanSd {
-    my ($arrayRef) = @_;
-    my $numSamples = scalar @{$arrayRef};
-    my $sum = &sum($arrayRef);
-    my $mean = $sum/$numSamples;
-
-    $sum=0;
-    foreach my $number (@{$arrayRef}) {
-	$sum += ($number-$mean)**2;
-    }
-    my $sd = sqrt($sum/$numSamples);
-
-    return ($mean,$sd);
-}
-
-sub sum {
-    my ($arrayRef) = @_;
-    my $sum = 0;
-    foreach my $number (@{$arrayRef}) {
-        $sum += $number;
-    }
-    return $sum;
-}
-
-sub median {
-    my ($arrayRef) = @_;
-    my @sorted = sort {$a <=> $b} @{$arrayRef};
-    my $length = scalar @{$arrayRef};    
-    if ( $length%2 ) {
-	return $sorted[int($length/2)];
-    } else {
-        return ($sorted[int($length/2)-1] + $sorted[int($length/2)])/2;
-    }
-}
-
-sub proteinHasThisEcNumber {
-    my ($protein,$ecNumber) = @_;
-    my ($a1,$b1,$c1,$d1) = split(/\./,$ecNumber);
-    foreach my $currentEc ( @{$protein->{ec}} ) {
-	my ($a2,$b2,$c2,$d2) = split(/\./,$currentEc);
-	$c2 = "-" if ($c1 eq "-");
-	$d2 = "-" if ($d1 eq "-");
-	if (($a1 eq $a2) && ($b1 eq $b2) && ($c1 eq $c2) && ($d1 eq $d2)) {
-	    return 1;
-	}
-    }
-    return 0;
-}
-
-sub ecDomainStats {
-    my ($proteinIds,$test,$trainingIds,$viableEcNumbers,$statsFh) = @_;
-
-    my $domainToLetter = &getDomainKey($proteinIds,$statsFh);
-       
-    my ($ecNumbers,$domainPerProtein) = &getAllDomainStringsPerEc($proteinIds,$test,$trainingIds,$viableEcNumbers,$domainToLetter);
-    &calculateDomainNumAndScore($ecNumbers,$viableEcNumbers);
-    &calculateDomainMaxScore($ecNumbers);
-
-    &printEcDomainStats($ecNumbers,$statsFh) if ($statsFh);
-    &printProteinDomains($domainPerProtein,$statsFh) if ($statsFh);
-
-    return ($ecNumbers,$domainPerProtein);
-}
-
-sub printProteinDomains {
-    my ($domainPerProtein,$statsFh) = @_;
-    print $statsFh "GENE\tDOMAINS\n";
-    foreach my $id (keys %{$domainPerProtein}) {
-	my $domainString = $domainPerProtein->{$id};
-	print $statsFh "$id\t$domainString\n";
-    }
-    print $statsFh "\n";
-}
-
-sub printEcDomainStats {
-    my ($ecNumbers,$statsFh) = @_;
-    print $statsFh "EC_NUMBER\tDOMAIN_STRING\tNUM_PROTEINS\tSCORE\n";
-    foreach my $ec (sort keys %{$ecNumbers}) {
-	print $statsFh "$ec\t--NUM PROTEINS--\t$ecNumbers->{$ec}->{numProteins}\t$ecNumbers->{$ec}->{maxScore}\n";
-	foreach my $string ( keys %{$ecNumbers->{$ec}->{domainString}} ) {
-	    print $statsFh "$ec\t$string";
-	    print $statsFh "\t$ecNumbers->{$ec}->{domainString}->{$string}->{count}";
-	    print $statsFh "\t$ecNumbers->{$ec}->{domainString}->{$string}->{score}\n";
-	}
-    }
-    print $statsFh "\n";
-}
-
-sub getAllDomainStringsPerEc {
-    my ($proteinIds,$test,$trainingIds,$viableEcNumbers,$domainToLetter) = @_;
-    my ($ecNumbers,$domainPerProtein);
-
-    foreach my $id (keys %{$proteinIds}) {
-	my $domainString = &getDomain($proteinIds->{$id},$domainToLetter);
-	$domainPerProtein->{$id} = $domainString;
-	next if (scalar @{$proteinIds->{$id}->{ec}} == 0);
-	next if ($test && ! exists $trainingIds->{$id});
-	my $domains = &getAllPossibleCombinations($domainString,"");
-	foreach my $viableEc ( keys %{$viableEcNumbers} ) {
-	    if (&proteinHasThisEcNumber($proteinIds->{$id},$viableEc)) {
-		foreach my $domain (keys %{$domains}) {
-		    $ecNumbers->{$viableEc}->{domainString}->{$domain}->{count}++;
-		}
-	    }
-	}
-    }
-    return ($ecNumbers,$domainPerProtein);
-}
-
-sub calculateDomainNumAndScore {
-    my ($ecNumbers,$viableEcNumbers) = @_;
-    foreach my $ec (keys %{$ecNumbers}) {
-	$ecNumbers->{$ec}->{numProteins} = $viableEcNumbers->{$ec}->{numProteins};
-	foreach my $string ( keys %{$ecNumbers->{$ec}->{domainString}} ) {
-	    $ecNumbers->{$ec}->{domainString}->{$string}->{score} =  $ecNumbers->{$ec}->{domainString}->{$string}->{count} / $viableEcNumbers->{$ec}->{numProteins};
-	}
-    }
-}
-
-sub calculateDomainMaxScore {
-    my ($ecNumbers) = @_;
-    foreach my $ec (keys %{$ecNumbers}) {
-	$ecNumbers->{$ec}->{maxScore} = 0;
-	foreach my $string ( keys %{$ecNumbers->{$ec}->{domainString}} ) {
-	    $ecNumbers->{$ec}->{maxScore} += $ecNumbers->{$ec}->{domainString}->{$string}->{score};
-	}
-    }
-}
-
-sub getDomain {
-    my ($proteinRef,$domainToLetter) = @_;
-    my $domain = "-";
-    if (scalar @{$proteinRef->{domain}} > 0) {
-	my @domainLetters = map { $domainToLetter->{$_} } @{$proteinRef->{domain}};
-	$domain = join("",@domainLetters);
-    }
-    return $domain;
-}
-
-sub numProteinScore {
-    my ($numProteinsWithEc) = @_;
-    if ($numProteinsWithEc > 1) {
-	return 1;
-    } else {
-	return 0;
-    }
-}
-
-sub numGeneraScore {
-    my ($numGeneraWithEc) = @_;
-    if ($numGeneraWithEc > 1) {
-	return 1;
-    } else {
-	return 0;
-    }
-}
-
-sub domainScore {
-    my ($id,$ec,$domainStatsPerEc,$domainPerProtein) = @_;
-    my $score=0;
-    my $idDomain = $domainPerProtein->{$id};
-    die "There is no domain for $id. Testing EC $ec.\n" if (! defined $idDomain);
-    die "There is no domainStatsPerEc for $ec. Testing $id.\n" if (! exists $domainStatsPerEc->{$ec});
-
-    foreach my $domainString ( keys %{$domainStatsPerEc->{$ec}->{domainString}} ) {
-	if ($idDomain =~ /$domainString/) {
-	    $score += $domainStatsPerEc->{$ec}->{domainString}->{$domainString}->{score};
-	}
-    }
-    my $normalizedScore = $score / $domainStatsPerEc->{$ec}->{maxScore};
-    if ($normalizedScore > 0.8 ) {
-	return 4;
-    } elsif ($normalizedScore > 0.6) {
-	return 3;
-    } elsif ($normalizedScore > 0.4) {
-	return 2;
-    } elsif ($normalizedScore > 0.2) {
-	return 1;
-    } else {
-	return 0;
-    }
-}
-
-sub lengthScore {
-    my ($id,$ec,$proteinIds,$lengthStatsPerEc) = @_;
-
-    my $idLength = $proteinIds->{$id}->{length};
-    my $idDistanceFromMedian = abs($idLength - $lengthStatsPerEc->{$ec}->{median});
-    my $tenPercentOfMedian = 0.1 * $lengthStatsPerEc->{$ec}->{median};
-    
-    if ($idDistanceFromMedian <= $tenPercentOfMedian ) {
-	return 4;
-    } elsif ($idDistanceFromMedian <= 2*$tenPercentOfMedian) {
-	return 3;
-    } elsif ($idDistanceFromMedian <= 3*$tenPercentOfMedian) {
-	return 2;
-    } elsif ($idLength >= $lengthStatsPerEc->{$ec}->{min} && $idLength <= $lengthStatsPerEc->{$ec}->{max}) {
-	return 1;
-    } else {
-	return 0;
-    }
-}
-
-sub blastScore {
-    my ($id,$ec,$blastStatsPerEc,$blastStatsPerProtein,$proteinToEc) = @_;
-    #print "EC is $ec ID is $id\n";
-    # This protein was already assigned this EC
-    if ($proteinToEc->{$id} && $proteinToEc->{$id} eq $ec) {
-        return 4;
-    }
-    # This protein has no significant hit to the best representative
-    if (!$blastStatsPerProtein->{$id}) {
-	return 0;
-    }
-    # This protein did not have a good hit to the best rep
-    elsif ($blastStatsPerProtein->{$id} < 10) {
-	return 0;
-    }
-    # If the protein to centroid blast was > 100
-    elsif ($blastStatsPerProtein->{$id} > 100) {
-	if (!$blastStatsPerEc->{$ec}) {
-            return 0;
-        }
-        # And the best blast to the centroid for all proteins with this EC was > 100
-	elsif ($blastStatsPerEc->{$ec} > 100) {
-            # This protein EC combo scores a 3
-	     return 3;
-	    }
-        # If the best blast to the centroid for all proteins with this EC was > 50
-	elsif ($blastStatsPerEc->{$ec} > 50) {
-            # This protein EC combo scores a 2
-	    return 2;
-	}
-        # If the best blast to the centroid for all proteins with this EC was >= 10
-	elsif ($blastStatsPerEc->{$ec} >= 10) {
-            # This protein EC combo scores a 1
-	    return 1;
-	}
-    }
-    # Same process for this but this protein EC combo can only score a max of a 2
-    elsif ($blastStatsPerProtein->{$id} > 50) {
-	if (!$blastStatsPerEc->{$ec}) {
-            return 0;
-        }
-	elsif ($blastStatsPerEc->{$ec} > 50) {
-	    return 2;
-	}
-	elsif ($blastStatsPerEc->{$ec} >= 10) {
-	    return 1;
-	}
-    }
-    # Same process for this but this protein EC combo can only score a max of a 1
-    elsif ($blastStatsPerProtein->{$id} >= 10) {
-	if (!$blastStatsPerEc->{$ec}) {
-            return 0;
-        }
-	if ($blastStatsPerEc->{$ec} >= 10) {
-	    return 1;
-	}
-    }
-}
-
-sub getProteinScores {
-    my ($proteinIds,$numTotalProteins,$numTotalGenera,$test,$testIds,$viableEcNumbers,$domainStatsPerEc,$domainPerProtein,$lengthStatsPerEc,$blastStatsPerEc,$blastStatsPerProtein,$proteinToEc,$outputFile,$dbh) = @_;
-
-    my $scores;
-    foreach my $id (keys %{$proteinIds}) {
-	next if ($test && ! exists $testIds->{$id});
-	foreach my $ec (keys %{$viableEcNumbers}) {
-	    $scores->{$id}->{$ec}->{numProteinsWithEc} = $viableEcNumbers->{$ec}->{numProteins};
-	    $scores->{$id}->{$ec}->{numGeneraWithEc} = $viableEcNumbers->{$ec}->{numGenera};
-	    $scores->{$id}->{$ec}->{lengthScore} = &lengthScore($id,$ec,$proteinIds,$lengthStatsPerEc);
-	    $scores->{$id}->{$ec}->{blastScore} = &blastScore($id,$ec,$blastStatsPerEc,$blastStatsPerProtein,$proteinToEc);
-	    $scores->{$id}->{$ec}->{domainScore} = &domainScore($id,$ec,$domainStatsPerEc,$domainPerProtein);
-	    $scores->{$id}->{$ec}->{numTotalProteins} = $numTotalProteins;
-	    $scores->{$id}->{$ec}->{numTotalGenera} = $numTotalGenera;
-	    delete $scores->{$id}->{$ec} unless (&meetsScoreThreshold($scores->{$id}->{$ec}));
-	}
-    }
-    &printEcScores($scores,$outputFile,$dbh) if (!$test);
-    return $scores;
-}
-
-sub meetsScoreThreshold {
-    my ($scoreRef) = @_;
-    if ($scoreRef->{lengthScore} >= 1 && $scoreRef->{blastScore} >= 1 && $scoreRef->{domainScore} >= 1) {
-	return 1;
-    }
-    return 0;
-}
-
-
-sub deletePartialEcWithWorseScore {
-    my ($scores) = @_;
-    foreach my $id (keys %{$scores}) {
-	my %toDelete;
-	foreach my $ec (keys %{$scores->{$id}}) {
-	    my %ancestorEcs;
- 	    &getAllAncestors($ec,\%ancestorEcs);
-	    next if (keys %ancestorEcs == 0);
-	    
-	    foreach my $ancestorEc (keys %ancestorEcs) {
-		if (exists $scores->{$id}->{$ancestorEc} && &sameOrWorseScore($scores->{$id},$ec,$ancestorEc)) {
-		    $toDelete{$ancestorEc} = 1;
-		}
-	    }
-	}
-
-	foreach my $ec (keys %toDelete) {
-	    delete $scores->{$id}->{$ec};
-	}
-    }
-}
-
-sub sameOrWorseScore {
-    my ($scoresRef,$ec1,$ec2) = @_;
-    return 1 if ($scoresRef->{$ec1}->{numProteinsWithEc} >= 2 &&
-		 $scoresRef->{$ec1}->{numGeneraWithEc} >= 2 &&
-		 $scoresRef->{$ec2}->{lengthScore} <=
-		 $scoresRef->{$ec1}->{lengthScore} &&
-		 $scoresRef->{$ec2}->{blastScore} <=
-		 $scoresRef->{$ec1}->{blastScore} &&
-		 $scoresRef->{$ec2}->{domainScore} <=
-		 $scoresRef->{$ec1}->{domainScore});
-    return 0;
-}
-
-sub scoreToNumber {
-    my ($scoreRef) = @_;
-    my $score = $scoreRef->{domainScore} == -1 ? 0 : $scoreRef->{domainScore}/10;
-    $score += $scoreRef->{lengthScore} + $scoreRef->{blastScore};
-    return $score;
-}
-
-sub printEcScores {
-    my ($scores,$outputFile,$dbh) = @_;
-    foreach my $id (keys %{$scores}) {
-	my $editedId = $id;
-	my $abbrev;
-	if ($id =~ /^([a-z]{4})\|/) {
-	    $abbrev = $1;
-	} else {
-            $abbrev = &getAbbrev($dbh,$id);
-	}
-	if ($abbrev eq "rhiz") { # this is temporary, because rhiz on orthomcl equals rirr on fungidb
-	    $abbrev = "rirr";
-	    $editedId =~ s/^rhiz/rirr/;
-	}
-      	open(OUT,">>",$outputFile) || die "Cannot open file '$outputFile' for appending";
-	foreach my $ec (keys %{$scores->{$id}}) {
-	    my $string = &scoresToString($scores->{$id}->{$ec});
-	    my $text = "$editedId\t$ec\t$string\n";
-	    print OUT $text;
-	}
-        close OUT;
-    }
-}
-
-sub scoresToString {
-    my ($scoresRef) = @_;
-    my $a = $scoresRef->{numProteinsWithEc};
-    my $b = $scoresRef->{numTotalProteins};
-    my $c = $scoresRef->{numGeneraWithEc};
-    my $d = $scoresRef->{numTotalGenera};
-    my $e = $scoresRef->{lengthScore};
-    my $f = $scoresRef->{blastScore};
-    my $g = $scoresRef->{domainScore};
-    return "$a\t$b\t$c\t$d\t$e\t$f\t$g";
-}
-
-sub testScores {
-    my ($scores,$proteinIds,$testIds,$totalEcsRef,$noEcMatchRef,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh,$logFh) = @_;
-    my $group = &getGroupFromProteins($proteinIds,$logFh);
-    print $testFh "Group $group\n";
-    foreach my $id (keys %{$testIds}) {
-	foreach my $thisIdEc (@{$proteinIds->{$id}->{ec}}) {
-	    ${$totalEcsRef}++;
-	    print $testFh "$id\t$thisIdEc\t";
-	    &testEcNumberMatch($thisIdEc,$scores->{$id},$noEcMatchRef,$testExact,$testLessPrecise,$testMorePrecise,$testFh);
-	}
-	&testExtraEc($id,$proteinIds->{$id}->{ec},$scores->{$id},$numProteinsExtraEcRef,$numExtraEcRef,$testFh);
-	${$numTotalProteinsRef}++;
-    }
-}
-
-sub testExtraEc {
-    my ($id,$idEcs,$scoresForThisId,$numProteinsExtraEcRef,$numExtraEcRef,$testFh) = @_;
-    my $proteinNoMatch = 0;
-    foreach my $predictedEc (keys %{$scoresForThisId}) {
-	my $ecMatch = 0;
-	foreach my $idEc (@{$idEcs}) {
-	    if ($idEc eq $predictedEc) {
-		$ecMatch = 1;
-		last;
-	    }
-	}
-	if ($ecMatch == 0) {
-	    $proteinNoMatch = 1;
-	    my $string = &scoresToString($scoresForThisId->{$predictedEc});
-	    print $testFh "$id\t$predictedEc\t$string\tnew_assignment\n";
-	    ${$numExtraEcRef}++;
-	}
-    }
-    ${$numProteinsExtraEcRef}++ if ($proteinNoMatch == 1);
-}
-
-sub testEcNumberMatch {
-    my ($thisIdEc,$scoresForThisId,$noEcMatchRef,$testExact,$testLessPrecise,$testMorePrecise,$testFh) = @_;
-    if (exists $scoresForThisId->{$thisIdEc}) {   #exact match
-	my $string = &scoresToString($scoresForThisId->{$thisIdEc});
-	my $score = &scoreToNumber($scoresForThisId->{$thisIdEc});
-	$testExact->{$score}++;
-	print $testFh "$string\texact\n";
-	return;
-    }
-    if (&testMorePrecise($thisIdEc,$scoresForThisId,$testMorePrecise,$testFh)) {
-	return;
-    }
-    if (&testLessPrecise($thisIdEc,$scoresForThisId,$testLessPrecise,$testFh)) {
-	return;
-    }
-    &testNoMatch($noEcMatchRef,$scoresForThisId,$testFh);
-}
-
-sub testNoMatch {
-    my ($noEcMatchRef,$scoresForThisId,$testFh) = @_;
-    ${$noEcMatchRef}++;
-    print $testFh "\t\t\t\t\t\t\tno_match,ECs_predicted: ";
-    foreach my $thisIdEc (sort keys %{$scoresForThisId}) {
-	my $score = &scoreToNumber($scoresForThisId->{$thisIdEc});
-	print $testFh "$thisIdEc($score)";
-    }
-    print $testFh "\n";
-}
-
-sub testLessPrecise {
-    my ($thisIdEc,$scoresForThisId,$testLessPrecise,$testFh) = @_;
-    my $thisIdParentEc = &getParent($thisIdEc);
-    if (! $thisIdParentEc) {
-	return 0;
-    } elsif (exists $scoresForThisId->{$thisIdParentEc}) {
-	my $string = &scoresToString($scoresForThisId->{$thisIdParentEc});
-	my $score = &scoreToNumber($scoresForThisId->{$thisIdParentEc});	
-	print $testFh "$string\tprediction_less_precise: $thisIdParentEc\n";
-	$testLessPrecise->{$score}++;
-	return 1;
-    } else {
-	return &testLessPrecise($thisIdParentEc,$scoresForThisId,$testLessPrecise,$testFh);
-    }
-}
-
-sub testMorePrecise {
-    my ($thisIdEc,$scoresForThisId,$testMorePrecise,$testFh) = @_;
-    foreach my $predictedEc (keys %{$scoresForThisId}) {
-	if (&partialMatchEc($thisIdEc,$predictedEc)) {
-	    my $string = &scoresToString($scoresForThisId->{$predictedEc});
-	    my $score = &scoreToNumber($scoresForThisId->{$predictedEc});	
-	    print $testFh "$string\tprediction_more_precise: $predictedEc\n";
-	    $testMorePrecise->{$score}++;
-	    return 1;
-	}
-    }
-    return 0;
-}
-
-sub partialMatchEc {
-    my ($thisIdEc,$predictedEc) = @_;
-    my $predictedParentEc = &getParent($predictedEc);
-    if (! $predictedParentEc) {
-	return 0;
-    } elsif ($thisIdEc eq $predictedParentEc) {
-	return 1;
-    } else {
-	return &partialMatchEc($thisIdEc,$predictedParentEc);
-    }
-}
- 
-sub printTest {
-    my ($total,$noMatch,$testExact,$testLessPrecise,$testMorePrecise,$numProteinsExtraEcRef,$numExtraEcRef,$numTotalProteinsRef,$testFh) = @_;
-    my $percentNoMatch = sprintf("%.1f",100 * ${$noMatch} / ${$total});
-    my $numExactMatch = &sumHashValues($testExact);
-    my $percentExactMatch = sprintf("%.1f",100 * $numExactMatch / ${$total});
-    my $numLessPreciseMatch = &sumHashValues($testLessPrecise);
-    my $percentLessPreciseMatch = sprintf("%.1f",100 * $numLessPreciseMatch / ${$total});
-    my $numMorePreciseMatch = &sumHashValues($testMorePrecise);
-    my $percentMorePreciseMatch = sprintf("%.1f",100 * $numMorePreciseMatch / ${$total});
-    my $percentProteinsExtraEc = sprintf("%.1f",100 * ${$numProteinsExtraEcRef} / ${$numTotalProteinsRef});
-    print $testFh "\nSUMMARY\n";
-    print $testFh "Total tested ECs\t${$total}\n";
-    print $testFh "No match\t${$noMatch} ($percentNoMatch %)\n";
-    print $testFh "Exact match\t$numExactMatch ($percentExactMatch %)\n";
-    print $testFh "Less precise match\t$numLessPreciseMatch ($percentLessPreciseMatch %)\n";
-    print $testFh "More precise match\t$numMorePreciseMatch ($percentMorePreciseMatch %)\n";
-    print $testFh "Extra ECs\t${$numExtraEcRef}\n";
-    print $testFh "\nTotal tested proteins\t${$numTotalProteinsRef}\n";
-    print $testFh "Num proteins w/ extra predicted EC\t${$numProteinsExtraEcRef} ($percentProteinsExtraEc %)\n";
-    print $testFh "\nExact predictions:\n";
-    foreach my $score (sort { $b cmp $a } keys %{$testExact}) {
-	print $testFh "$score\t$testExact->{$score}\n";
-    }
-    print $testFh "Less precise predictions:\n";
-    foreach my $score (sort { $b cmp $a } keys %{$testLessPrecise}) {
-	print $testFh "$score\t$testLessPrecise->{$score}\n";
-    }
-    print $testFh "More precise predictions:\n";
-    foreach my $score (sort { $b cmp $a } keys %{$testMorePrecise}) {
-	print $testFh "$score\t$testMorePrecise->{$score}\n";
-    }
-
-    close $testFh;
-}
-
-sub sumHashValues {
-    my ($hashRef) = @_;
-    my $sum = 0;
-    foreach my $key (keys %{$hashRef}) {
-	$sum += $hashRef->{$key};
-    }
-    return $sum;
-}
-    
-sub getAllPossibleCombinations {
-    my ($domainString,$delimiter) = @_;
-
-    my $domains;
-    my @domainArray = split(/$delimiter/,$domainString);
-    my $length = scalar @domainArray;
-    for (my $a=0; $a<$length; $a++) {
-	for (my $b=$a; $b<$length; $b++) {
-	    my $string = join("$delimiter",@domainArray[$a..$b]);
-	    $domains->{$string} = 1;
-	}
-    }
-    return $domains;
-}
-
-
-sub getDomainKey {
-    my ($proteinIds,$statsFh) = @_;
-    
-    my $domains;
-
-    foreach my $protein (keys %{$proteinIds}) {
-	foreach my $domain (@{$proteinIds->{$protein}->{domain}}) {
-	    $domains->{$domain}++;
-	}
-    }
-
-    my @alphabet = ("A".."Z","a".."z",0..9);
-    my $numCharacters = scalar @alphabet;
-    my $numDomains = scalar keys %{$domains};
-    # stringLength is how long string needs to be in order to capture all domains
-    # 1 if <= 62 domains, 2 if <= 3844, 3 if <= 238328, 4 if <= 14776336  (max length is set to 4)
-    my $stringLength = 0;
-    for (my $a=1; $a<=4; $a++) {
-	if ($numDomains <= $numCharacters**$a) {
-	    $stringLength = $a;
-	    last;
-	}
-    }
-    die "There are more than 14,776,336 domains in this group so cannot perform A-Z a-z 0-9 mapping" if ($stringLength == 0);
-
-    my $stringCounter = &initializeStringCounter($stringLength);
-    my @domainArray = sort { $domains->{$b} <=> $domains->{$a} } keys %{$domains};
-    
-    print $statsFh "DOMAIN\tSTRING\n" if ($statsFh);
-    for (my $currentDomain=0; $currentDomain<$numDomains; $currentDomain++) {
-	my $currentString = &makeStringFromCounter($stringCounter,\@alphabet);
-	$domains->{$domainArray[$currentDomain]} = $currentString;
-	print $statsFh "$domainArray[$currentDomain]\t$currentString\n" if ($statsFh);
-	$stringCounter = &increaseStringCounter($stringCounter,$numCharacters);
-    }
-    print $statsFh "\n" if ($statsFh);
-    
-    return $domains;
-}
-
-sub initializeStringCounter {
-    my ($stringLength) = @_;
-    my $stringCounter;
-    for (my $a=0; $a<$stringLength; $a++) {
-	push @{$stringCounter}, 0;
-    }
-    return $stringCounter;
-}
-
-sub makeStringFromCounter {
-    my ($stringCounter,$alphabet) = @_;
-    my @string = map { $alphabet->[$_] } @{$stringCounter};
-    return join("",@string);
-}
-
-sub increaseStringCounter {
-    my ($stringCounter,$numCharacters) = @_;
-
-    my $numDigits = scalar @{$stringCounter};
-
-    for (my $i=($numDigits-1); $i>=0; $i--) {
-	if ( $stringCounter->[$i] == ($numCharacters-1) ) {
-	    $stringCounter->[$i] = 0;
-	} else {
-	    $stringCounter->[$i]++;
-	    last;
-	}
-    }
-    return $stringCounter;
-    
-}
-
-sub writeProteinInfoFile {
-    my ($proteinIds,$numTotalProteins,$numTotalGenera,$proteinInfoFile) =@_;
-
-    open(OUT,">",$proteinInfoFile) || die "Cannot open '$proteinInfoFile' for writing";
-    print OUT "Total num proteins\t$numTotalProteins\n";
-    print OUT "Total num genera\t$numTotalGenera\n";
-    print OUT "GROUP\tPROTEIN_ID\tTAXON\tLENGTH\tDOMAIN\tPRODUCT\tEC_NUMBER\n";
-    foreach my $id (keys %{$proteinIds}) {
-	my $group = $proteinIds->{$id}->{group};
-	my $length = $proteinIds->{$id}->{length};
-	my $product = $proteinIds->{$id}->{product};
-	my $taxon = $proteinIds->{$id}->{taxon};
-	my @ecs = @{$proteinIds->{$id}->{ec}};
-	$ecs[0] = "-" if (scalar @ecs == 0);
-	my $ec = join(",",@ecs);
-	my @domains = @{$proteinIds->{$id}->{domain}};
-	$domains[0] = "-" if (scalar @domains == 0);
-	my $domain = join(",",@domains);
-	print OUT "$group\t$id\t$taxon\t$length\t$domain\t$product\t$ec\n";
-    }
-    close OUT; 
-}
-
-sub readProteinInfoFile {
-    my ($proteinInfoFile,$excludeOld) = @_;
-    my $proteinIds;
-    open(IN,$proteinInfoFile) || die "Cannot open '$proteinInfoFile' for reading";
-    my $numTotalProteins = &getSecondColumn(<IN>);
-    my $numTotalGenera = &getSecondColumn(<IN>);
-    while (my $line = <IN>) {
-	chomp($line);
-	next if ($line !~ /^OG/);
-	my ($group,$id,$taxon,$length,$domains,$product,$ecs) = split("\t",$line);
-	next if ($excludeOld && $id =~ /-old\|/);
-	$proteinIds->{$id}->{group} = $group;
-	$proteinIds->{$id}->{taxon} = $taxon;
-	$proteinIds->{$id}->{length} = $length;
-	$proteinIds->{$id}->{product} = $product;
-	$ecs = $ecs eq "-" ? "" : $ecs;
-	$proteinIds->{$id}->{ec} = [split(",",$ecs)];
-	$domains = $domains eq "-" ? "" : $domains;
-	$proteinIds->{$id}->{domain} = [split(",",$domains)];
-    }
-    close IN;
-    return ($proteinIds,$numTotalProteins,$numTotalGenera);
-}
-
-sub getSecondColumn {
-    my ($line) = @_;
-    chomp $line;
-    my @lineArray = split("\t",$line);
-    return $lineArray[1];
-}
-
-
-sub getProteinInfo {
-    my ($group,$excludeOld,$dbh,$proteinInfoFile) = @_;
-
-    my $proteinIds;
-    my $numTotalProteins;
-    my $numTotalGenera;
-    
-    if ($proteinInfoFile && -e $proteinInfoFile) {
-	($proteinIds,$numTotalProteins,$numTotalGenera) = &readProteinInfoFile($proteinInfoFile,$excludeOld);
-    } else {
-	($proteinIds,$numTotalProteins,$numTotalGenera) = &getProteinsFromDatabase($proteinIds,$group,$excludeOld,$dbh);
-	$proteinIds = &getEcsFromDatabase($proteinIds,$group,$excludeOld,$dbh);
-	$proteinIds = &getDomainsFromDatabase($proteinIds,$group,$excludeOld,$dbh);
-	&writeProteinInfoFile($proteinIds,$numTotalProteins,$numTotalGenera,$proteinInfoFile) if ($proteinInfoFile);
-    }
-    return ($proteinIds,$numTotalProteins,$numTotalGenera);
-}
-
-sub getProteinsFromDatabase {
-    my ($proteinIds,$group,$excludeOld,$dbh) = @_;
-    my $numTotalProteins = 0;
-    my %genera;
-    my $query = $dbh->prepare(&proteinsSql($group));
-    $query->execute();
-    while (my($id,$product,$length,$corePeripheral,$group,$taxon) = $query->fetchrow_array()) {
-	next if ( ($excludeOld && $id =~ /-old\|/) || $id eq "");
-	$numTotalProteins++;
-	$proteinIds->{$id}->{product} = $product;
-	$proteinIds->{$id}->{length} = $length;
-	$proteinIds->{$id}->{corePeripheral} = $corePeripheral;
-	$proteinIds->{$id}->{group} = $group;
-	$proteinIds->{$id}->{taxon} = $taxon;
-	$proteinIds->{$id}->{ec} = [];
-	$proteinIds->{$id}->{domain} = [];
-	$genera{&getGenusFromProtein($proteinIds->{$id})} = 1;
-    }    
-    $query->finish();
-    my $numTotalGenera = keys %genera;
-    return ($proteinIds,$numTotalProteins,$numTotalGenera);
-}
-
-sub createIdString {
-    my ($proteinIds,$excludeOld) = @_;
-    my @ids;
-    foreach my $id (keys %{$proteinIds}) {
-	next if ($excludeOld && $id =~ /-old\|/);
-	push @ids, $id;
-    }
-    my $idString = join("','",@ids);
-    return "('".$idString."')";
-}
-
-sub getEcsFromDatabase {
-    my ($proteinIds,$group,$excludeOld,$dbh) = @_;
-
-    my $query = $dbh->prepare(&ecsSql($group));
-    
-    $query->execute();
-    while (my($id,$ecString) = $query->fetchrow_array()) {
-	next if ($excludeOld && $id =~ /-old\|/);
-	die "The protein '$id' was not found in group" if (! exists $proteinIds->{$id});
-	$ecString =~ s/ //g;
-	my @multipleEc = split(/[;,]/,$ecString);
-	foreach my $ec ( @multipleEc) {
-	    next if (! &validEcNumber($ec));
-	    push @{$proteinIds->{$id}->{ec}}, $ec;
-	}
-    }
-    $query->finish();
-    
-    # sort EC numbers for each protein
-    foreach my $id (keys %{$proteinIds}) {
-	@{$proteinIds->{$id}->{ec}} = sort @{$proteinIds->{$id}->{ec}};
-    }
-    
-    return $proteinIds;
-}
-
-sub getDomainsFromDatabase {
-    my ($proteinIds,$group,$excludeOld,$dbh) = @_;
-
-     my $query = $dbh->prepare(&domainsSql($group));
-
-    $query->execute();
-    while (my($id,$domain) = $query->fetchrow_array()) {
-	next if ($excludeOld && $id =~ /-old\|/);
-	die "The protein '$id' was not found in group" if (! exists $proteinIds->{$id});
-	push @{$proteinIds->{$id}->{domain}}, $domain;
-    }
-    $query->finish();
-    return $proteinIds;
-}
-
-sub writeDomainCountFile {
-    my ($domainCount,$domainCountFile) = @_;
-
-    open(OUT,">",$domainCountFile) || die "Cannot open $domainCountFile for writing\n";
-    print OUT "DOMAIN\tNUM_PROTEINS\n";
-    print OUT "all_proteins\t$domainCount->{numProteins}\n";
-    foreach my $domain (keys %{$domainCount->{domain}}) {
-	print OUT "$domain\t$domainCount->{domain}->{$domain}\n";
-    }
-    close OUT;
-}
-   
-sub getGroupFromProteins {
-    my ($proteinIds,$logFh) = @_;
-    my $group = "";
-    foreach my $protein (keys %{$proteinIds}) {
-	if ($group ne "" && $group ne $proteinIds->{$protein}->{group}) {
-	    die "expected only one ortholog group but obtained more than one: $group $proteinIds->{$protein}->{group}";
-	}
-	$group = $proteinIds->{$protein}->{group};
-    }
-    if ($group eq "") {
-	print {$logFh ? $logFh : *STDERR } "Did not obtain any ortholog groups from these proteins: ";
-	print {$logFh ? $logFh : *STDERR } "$_ " foreach (keys %{$proteinIds});
-	die;
-    }
-    return $group;
-}
-
-sub getGroupsFromDatabase {
-    my ($minNumProteinsWithEc,$maxNumProteinsWithEc,$dbh) = @_;
-    my %groups;
-    
-    my $query = $dbh->prepare(&groupsSql($minNumProteinsWithEc,$maxNumProteinsWithEc));
-
-    $query->execute();
-    while (my($group) = $query->fetchrow_array()) {
-        $groups{$group} = 1;
-    }
-    $query->finish();
-    return \%groups;
-}
-
-sub domainsSql {
-    my ($group) = @_;
-    return "SELECT full_id, accession 
-            FROM webready.ProteinDomainAssignment                                                                                                                               
-            WHERE group_name = '$group'";
-}
-
-sub ecsSql {
-    my ($group) = @_;
-    return  "SELECT pda.full_id,ec.ec_number 
-             FROM SRes.EnzymeClass ec, DoTS.AASequenceEnzymeClass aaec, webready.ProteinDomainAssignment pda 
-             WHERE ec.enzyme_class_id = aaec.enzyme_class_id 
-             AND aaec.aa_sequence_id = pda.aa_sequence_id
-             AND pda.group_name = '$group'"
-}
-
-sub proteinsSql {
-    my ($group) = @_;
-    return "SELECT full_id,product,length,core_peripheral,group_name,organism_name                                     
-            FROM webready.ProteinSequenceGroup WHERE group_name='$group'";
-}
-
-sub groupsSql {
-    my ($minNumProteinsWithEc,$maxNumProteinsWithEc) = @_;
-
-    return "SELECT DISTINCT(group_name) FROM webready.ProteinSequenceGroup";
-}
-
-sub orthoOrganismSql {
-    my ($excludeOld) = @_;
-    my $whereClause = $excludeOld ? "AND three_letter_abbrev NOT LIKE '%-old'" : "";
-    return "WITH RECURSIVE TaxonHierarchy AS (
-                SELECT
-                    three_letter_abbrev,
-                    orthomcl_clade_id,
-                    core_peripheral,
-                    three_letter_abbrev AS taxon_group_abbrev
-                FROM apidb.OrthomclClade
-                WHERE core_peripheral = 'Z'
-
-                UNION ALL
-
-                SELECT
-                    child.three_letter_abbrev,
-                    child.orthomcl_clade_id,
-                    child.core_peripheral,
-                    parent.taxon_group_abbrev
-                FROM apidb.OrthomclClade child
-                JOIN TaxonHierarchy parent
-                  ON child.parent_id = parent.orthomcl_clade_id
-            )
-            SELECT three_letter_abbrev, taxon_group_abbrev
-            FROM TaxonHierarchy
-            WHERE core_peripheral IN ('C','P')
-                  $whereClause";
-}
-
-sub getDbHandle {
-
-  #my $gusConfigFile = $ENV{GUS_HOME} . "/config/gus.config";
-  my ($gusConfigFile) = @_;
-  my @properties = ();
-  my $gusconfig = CBIL::Util::PropertySet->new($gusConfigFile, \@properties, 1);
-
-  my $u = $gusconfig->{props}->{databaseLogin};
-  my $pw = $gusconfig->{props}->{databasePassword};
-  my $dsn = $gusconfig->{props}->{dbiDsn};
-
-  my $dbh = DBI->connect($dsn, $u, $pw) or die DBI::errstr;
-  $dbh->{RaiseError} = 1;
-  $dbh->{AutoCommit} = 0;
-
-  return $dbh;
-}
-
-sub makeDir {
-    my ($dir) = @_;
-    system("mkdir -p $dir") unless (-e $dir);
-}
-
-sub getCladeToProject {
-    my %cladeToProject = (
-        ALVE => "CryptoDB",
-        APIC => "CryptoDB",
-        HAEM => "PlasmoDB",
-        PIRO => "PiroplasmaDB",
-        AMOE => "AmoebaDB",
-        EUGL => "TriTrypDB",
-        FUNG => "FungiDB",
-        MICR => "MicrosporidiaDB",
-        BASI => "FungiDB",
-        ASCO => "FungiDB",
-        MUCO => "FungiDB",
-        CHYT => "FungiDB",
-	ARTH => "VectorBase",
-	MAMM => "HostDB",
-        OOMY => "FungiDB",
-	ACTI => "none",
-	ARCH => "none",
-	VIRI => "none",
-	CHOR => "none",
-	CHLO => "none",
-	CILI => "none",
-	CREN => "none",
-	CRYP => "none",
-	EURY => "none",
-	FIRM => "none",
-	KORA => "none",
-	FIRM => "none",
-	NEMA => "none",
-	OBAC => "none",
-	PROG => "none",
-	PROA => "none",
-	PROB => "none",
-	PROD => "none",
-	PROE => "none",
-	RHOD => "none",
-	STRE => "none",
-	TUNI => "none"
-        );
-    return \%cladeToProject;
-}

From 692c42f67d7cc3c8a2f196d7332086262b458fa2 Mon Sep 17 00:00:00 2001
From: John Brestelli <jbrestell@gmail.com>
Date: Wed, 4 Mar 2026 13:28:19 -0500
Subject: [PATCH 4/7] Add group/cluster stats and length score to EC assignment
 output

Add cluster_mean_length and length_vs_cluster_mean (fractional deviation
from cluster mean) columns, plus group-level counts: group_size,
n_annotated_in_group, n_supporting_in_group.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Load/bin/assignEcByOrthologs.pl | 37 +++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl
index 77dc1313b..a0fa3b527 100755
--- a/Load/bin/assignEcByOrthologs.pl
+++ b/Load/bin/assignEcByOrthologs.pl
@@ -6,6 +6,7 @@
 use lib "$ENV{GUS_HOME}/lib/perl";
 
 use Getopt::Long;
+use List::Util qw(sum);
 use DBI;
 use GUS::Supported::GusConfig;
 
@@ -175,7 +176,12 @@
     n_supporting
     n_annotated_in_cluster
     cluster_size
+    cluster_mean_length
+    length_vs_cluster_mean
     cluster_profile
+    group_size
+    n_annotated_in_group
+    n_supporting_in_group
 )), "\n";
 
 # --- Process each group ---
@@ -192,6 +198,21 @@
         push @{ $clusters{$key} }, $aaid;
     }
 
+    # Group-level stats (computed once per group, used in every output row).
+    # These are derived solely from %protein_ec, which is loaded from the DB
+    # at startup and never modified during processing, so these counts always
+    # reflect original (pre-transitive) annotations only.
+    my $group_size         = scalar @members;
+    my $n_annotated_group  = scalar grep { $protein_ec{$_} && @{ $protein_ec{$_} } } @members;
+    my %group_ec_count;   # ec => count of proteins in group originally carrying it
+    for my $aaid (@members) {
+        my %seen;
+        for my $ec (@{ $protein_ec{$aaid} // [] }) {
+            next if $seen{$ec}++;
+            $group_ec_count{$ec}++;
+        }
+    }
+
     # Report which cluster the focus protein landed in
     if ($verbose && $source_id) {
         my ($focus_aaid) = grep { $protein_info{$_}{source_id} eq $source_id } @members;
@@ -239,6 +260,9 @@
         my $cluster_size    = scalar @cluster;
         my $display_profile = $profile_key eq '' ? 'none' : $profile_key;
 
+        my @cluster_lengths = map { $protein_info{$_}{protein_length} } @cluster;
+        my $cluster_mean = sum(@cluster_lengths) / $cluster_size;
+
         # Count support per EC (post-normalization)
         my %ec_support;
         for my $aaid (keys %protein_ecs_in_cluster) {
@@ -331,7 +355,11 @@
                 my $score   = $pass->{score};
                 my $is_novel = $existing_ecs{$ec} ? 0 : 1;
 
-                printf $OUT "%s\t%s\t%s\t%d\t%s\t%d\t%.4f\t%d\t%d\t%d\t%s\n",
+                my $len_vs_mean = $cluster_mean > 0
+                    ? ($info->{protein_length} - $cluster_mean) / $cluster_mean
+                    : 0;
+
+                printf $OUT "%s\t%s\t%s\t%d\t%s\t%d\t%.4f\t%d\t%d\t%d\t%d\t%.4f\t%s\t%d\t%d\t%d\n",
                     $gid,
                     $aaid,
                     $info->{source_id},
@@ -342,7 +370,12 @@
                     $support,
                     $n_annotated,
                     $cluster_size,
-                    $display_profile;
+                    int($cluster_mean + 0.5),
+                    $len_vs_mean,
+                    $display_profile,
+                    $group_size,
+                    $n_annotated_group,
+                    $group_ec_count{$ec} // 0;
 
                 $rows_written++;
             }

From 356161f6a76e5bc4ed685deeb89b69a607c4a9da Mon Sep 17 00:00:00 2001
From: John Brestelli <jbrestell@gmail.com>
Date: Wed, 4 Mar 2026 13:45:58 -0500
Subject: [PATCH 5/7] Update InsertEcMappingFromOrtho to match new
 assignEcByOrthologs TSV format

- Read aa_sequence_id directly from file (no identifier lookup needed)
- Populate new cluster/group stat columns: num_supporting_cluster,
  num_protein_cluster, num_any_ec_cluster, num_supporting_group,
  num_protein_group, num_any_ec_group, length_mean
- Map confidence_score -> domain_score, length_vs_cluster_mean -> length_score
- Load all rows (both novel and already-annotated)
- Remove unused imports and aaSeqLocusTagMappingSql argument

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Load/plugin/perl/InsertEcMappingFromOrtho.pm | 246 ++++++++++---------
 1 file changed, 124 insertions(+), 122 deletions(-)

diff --git a/Load/plugin/perl/InsertEcMappingFromOrtho.pm b/Load/plugin/perl/InsertEcMappingFromOrtho.pm
index c09698b76..3890a2350 100644
--- a/Load/plugin/perl/InsertEcMappingFromOrtho.pm
+++ b/Load/plugin/perl/InsertEcMappingFromOrtho.pm
@@ -1,9 +1,8 @@
 #^^^^^^^^^^^^^^^^^^^^^^^^^ End GUS4_STATUS ^^^^^^^^^^^^^^^^^^^^
 ##                 InsertEcMappingFromOrtho.pm
 ##
-## Creates new entries in the table DoTS.AASequenceEnzymeClass to represent
-## the EC mappings found in a tab delimited file of the form EC number, alias
-## $Id$
+## Creates new entries in the table DoTS.AASequenceEnzymeClass from the
+## TSV output produced by assignEcByOrthologs.pl
 ##
 #######################################################################
 
@@ -14,88 +13,80 @@ use strict 'vars';
 
 use GUS::PluginMgr::Plugin;
 use lib "$ENV{GUS_HOME}/lib/perl";
-use FileHandle;
-use Carp;
 use GUS::Model::DoTS::AASequenceEnzymeClass;
 use GUS::Model::SRes::EnzymeClass;
-use GUS::Model::DoTS::TranslatedAAFeature;
-use GUS::Model::DoTS::GeneFeature;
-#use GUS::Model::DoTS::NAFeatureNaGene;
-use GUS::Model::DoTS::NAGene;
 
 
 my $purposeBrief = <<PURPOSEBRIEF;
-Creates new entries in table DoTS.AASequenceEnzymeClass to represent new aa sequence/enzyme class associations.
+Creates new entries in table DoTS.AASequenceEnzymeClass from transitive EC
+assignments produced by assignEcByOrthologs.pl.
 PURPOSEBRIEF
 
 my $purpose = <<PLUGIN_PURPOSE;
-Takes in a tab delimited file of the order EC number, identifier, and creates new entries in table DoTS.AASequenceEnzymeClass to represent new aa sequence/enzyme class associations.  If the identifier is not the primary identifier, we will map the EC number to the primary identifier via the NAGene table, which houses gene aliases.  The mapping then will be NAGene to NAFeatureNAGene to Transcript.
+Reads the TSV output of assignEcByOrthologs.pl and inserts EC assignments
+into DoTS.AASequenceEnzymeClass.  The aa_sequence_id is taken directly from
+the file, so no identifier lookup is required.  Cluster- and group-level
+statistics from the file are stored alongside each assignment.
 PLUGIN_PURPOSE
 
 my $tablesAffected =
-	[['DoTS.AASequenceEnzymeClass', 'The entries representing the new aa_sequence/enzyme class mappings are created here']];
+    [['DoTS.AASequenceEnzymeClass', 'New aa_sequence/enzyme class mappings are inserted here']];
 
-my $tablesDependedOn = [['SRes::EnzymeClass','The EC Numbers from the EC mapping file must have entries in this table to be considered legitimate'],['DoTS::TranslatedAAFeature','The sequences mapped to EC Numbers by the EC mapping file must have entries in this table'],['DoTS::GeneFeature','This table may contain the source ID that will be used to map from the Pfid provided by the mapping file to the entry in DoTS::TranslatedAAFeature'],['DoTS::NAFeatureNaGene','If DoTS::GeneFeature does not contain the Pfid from the mapping file as a source ID, we will need to use this table to check synonyms so that we can map the Pfid to the entry in DoTS::TranslatedAAFeature'],['DoTS::NAGene','If DoTS::GeneFeature does not contain the Pfid from the mapping file as a source ID, we will need to check this table for synonyms so that we can map the Pfid to the entry in DoTS::TranslatedAAFeature']];
+my $tablesDependedOn =
+    [['SRes.EnzymeClass',     'EC numbers must already exist in this table'],
+     ['DoTS.AASequenceImp',   'aa_sequence_id values must already exist in this table']];
 
 my $howToRestart = <<PLUGIN_RESTART;
-There is currently no restart method
+Re-running the plugin is safe: existing rows are detected via retrieveFromDB
+and skipped.
 PLUGIN_RESTART
 
 my $failureCases = <<PLUGIN_FAILURE_CASES;
-There are no known failure cases
+Rows whose EC number is not present in SRes.EnzymeClass are skipped and logged.
 PLUGIN_FAILURE_CASES
 
 my $notes = <<PLUGIN_NOTES;
-
+Both novel (is_novel=1) and already-annotated (is_novel=0) rows are loaded.
 PLUGIN_NOTES
 
-my $documentation = { purpose=>$purpose,
-		      purposeBrief=>$purposeBrief,
-		      tablesAffected=>$tablesAffected,
-		      tablesDependedOn=>$tablesDependedOn,
-		      howToRestart=>$howToRestart,
-		      failureCases=>$failureCases,
-		      notes=>$notes
-		    };
-
+my $documentation = { purpose          => $purpose,
+                      purposeBrief     => $purposeBrief,
+                      tablesAffected   => $tablesAffected,
+                      tablesDependedOn => $tablesDependedOn,
+                      howToRestart     => $howToRestart,
+                      failureCases     => $failureCases,
+                      notes            => $notes
+                    };
 
-my $argsDeclaration = 
+my $argsDeclaration =
   [
-   fileArg({name => 'ECMappingFile',
-	  descr => 'pathname for the file containing the EC mapping data',
-	  constraintFunc => undef,
-	  reqd => 1,
-	  isList => 0,
-	  mustExist => 1,
-	  format => 'Two column tab delimited file in the order EC number, identifier'
-        }),
-   stringArg({name => 'evidenceCode',
-	      descr => 'the evidence code with which data should be entered into the AASequenceEnzymeClass table',
-	      reqd => 1,
-	      constraintFunc => undef,
-	      isList => 0,
-	     }), 
-   stringArg({name => 'aaSeqLocusTagMappingSql',
-              descr => 'sql which returns aa_sequence_id(s) for a given identifier in the EC mapping file. Use a question mark as a macro for where the id should be interpolated into the sql string.  Id will most likely be a locus tag',
-	      reqd => 1,
-	      constraintFunc => undef,
-	      isList => 0,
-	     })
-
+   fileArg({ name           => 'ECMappingFile',
+             descr          => 'TSV file produced by assignEcByOrthologs.pl',
+             constraintFunc => undef,
+             reqd           => 1,
+             isList         => 0,
+             mustExist      => 1,
+             format         => 'Tab-delimited with header row'
+           }),
+   stringArg({ name           => 'evidenceCode',
+               descr          => 'Evidence code to store in AASequenceEnzymeClass (e.g. OrthoMCLDerived)',
+               reqd           => 1,
+               constraintFunc => undef,
+               isList         => 0,
+             }),
   ];
 
 sub new {
     my ($class) = @_;
     my $self = {};
-    bless($self,$class);
-
-    $self->initialize({requiredDbVersion => 4.0,
-		       cvsRevision => '$Revision: 24836 $', # cvs fills this in!
-		       name => ref($self),
-		       argsDeclaration => $argsDeclaration,
-		       documentation => $documentation
-		      });
-
+    bless($self, $class);
+
+    $self->initialize({ requiredDbVersion => 4.0,
+                        cvsRevision       => '$Revision$',
+                        name              => ref($self),
+                        argsDeclaration   => $argsDeclaration,
+                        documentation     => $documentation
+                      });
     return $self;
 }
 
@@ -107,87 +98,98 @@ sub run {
     my ($self) = @_;
 
     my $mappingFile = $self->getArg('ECMappingFile');
-    my $evidCode = $self->getArg('evidenceCode');
-    $self->getAlgInvocation()->setMaximumNumberOfObjects(100000);
-
-    my $queryHandle = $self->getQueryHandle();
-
-    my %ecNumbers;
-    my %aaSeqIds;
+    my $evidCode    = $self->getArg('evidenceCode');
 
-    open (ECMAP, "$mappingFile") || die "Can't open the file $mappingFile.  Reason: $!\n";
-    while (my $line = <ECMAP>) {
-	chomp($line);
-	my @row = split('\t',$line);
-	next if (scalar @row != 9);
-	my ($id,$ec,$numProteinsWithEc,$numTotalProteins,$numGeneraWithEc,$numTotalGenera,$lengthScore,$blastScore,$domainScore) = @row; 
-
-	my $enzymeClass = $self->getEnzymeClass($ec, \%ecNumbers);
-        if (!$enzymeClass) {
-            print "This EC does not exist\n";
-            next;
-        }
+    $self->getAlgInvocation()->setMaximumNumberOfObjects(100000);
 
-        my $aaSeqId = $self->getAASeqId($id, $queryHandle);
-        if (!$aaSeqId) {
-            print "This aaSeqId does not exist\n";
+    my %ecCache;   # ec_number => enzyme_class_id
+    my ($inserted, $skipped_existing, $skipped_no_ec) = (0, 0, 0);
+
+    open(my $FH, '<', $mappingFile) or die "Cannot open '$mappingFile': $!\n";
+
+    my $header = <$FH>;   # skip header line
+
+    while (my $line = <$FH>) {
+        chomp $line;
+        my @row = split('\t', $line);
+
+        # Expected columns (0-based):
+        #  0  group_id
+        #  1  aa_sequence_id
+        #  2  source_id
+        #  3  protein_length
+        #  4  assigned_ec_number
+        #  5  is_novel
+        #  6  confidence_score
+        #  7  n_supporting
+        #  8  n_annotated_in_cluster
+        #  9  cluster_size
+        # 10  cluster_mean_length
+        # 11  length_vs_cluster_mean
+        # 12  cluster_profile
+        # 13  group_size
+        # 14  n_annotated_in_group
+        # 15  n_supporting_in_group
+
+        next unless scalar @row == 16;
+
+        my ($group_id, $aa_sequence_id, $source_id, $protein_length,
+            $ec_number, $is_novel, $confidence_score,
+            $n_supporting, $n_annotated_in_cluster, $cluster_size,
+            $cluster_mean_length, $length_vs_cluster_mean, $cluster_profile,
+            $group_size, $n_annotated_in_group, $n_supporting_in_group) = @row;
+
+        my $enzyme_class_id = $self->getEnzymeClassId($ec_number, \%ecCache);
+        unless ($enzyme_class_id) {
+            $self->log("WARNING: EC number '$ec_number' not found in SRes.EnzymeClass — skipping $source_id");
+            $skipped_no_ec++;
             next;
         }
 
-	$self->log("Processing Pfid: $id, ECNumber: $ec");
-	
-        my $newAASeqEnzClass =  GUS::Model::DoTS::AASequenceEnzymeClass->new({
-                'aa_sequence_id' => $aaSeqId,
-		'enzyme_class_id' => $enzymeClass,
-		'evidence_code' => $evidCode,
-		'num_protein_with_ec' => $numProteinsWithEc,
-		'num_protein_in_group' => $numTotalProteins,
-		'num_genera_with_ec' => $numGeneraWithEc,
-		'num_genera_in_group' => $numTotalGenera,
-		'length_score' => $lengthScore,
-		'blast_score' => $blastScore,
-		'domain_score' => $domainScore
+        my $newRow = GUS::Model::DoTS::AASequenceEnzymeClass->new({
+            aa_sequence_id         => $aa_sequence_id,
+            enzyme_class_id        => $enzyme_class_id,
+            evidence_code          => $evidCode,
+            domain_score           => $confidence_score,
+            length_score           => $length_vs_cluster_mean,
+            length_mean            => $cluster_mean_length,
+            num_supporting_cluster => $n_supporting,
+            num_protein_cluster    => $cluster_size,
+            num_any_ec_cluster     => $n_annotated_in_cluster,
+            num_supporting_group   => $n_supporting_in_group,
+            num_protein_group      => $group_size,
+            num_any_ec_group       => $n_annotated_in_group,
         });
-        if (! $newAASeqEnzClass->retrieveFromDB()) {
-	      $newAASeqEnzClass->submit();
-	      $self->log("  submitted enzyme $enzymeClass, seq $aaSeqId");
+
+        if ($newRow->retrieveFromDB()) {
+            $skipped_existing++;
+        } else {
+            $newRow->submit();
+            $self->log("Inserted EC=$ec_number for $source_id (aa_sequence_id=$aa_sequence_id, group=$group_id)");
+            $inserted++;
         }
-	$self->undefPointerCache();
-    }
-    return "Finished processing EC Mapping file\n";
-}
 
-###### FETCH THE EC ID FOR A GIVEN EC NUMBER ######
-sub getEnzymeClass {
-    my ($self, $ecNumber, $ecHash) = @_;
-    if (! exists $ecHash->{$ecNumber}) {
-	my $newEnzymeClass =  GUS::Model::SRes::EnzymeClass->new({
-	    'ec_number' => $ecNumber
-								 });
-	$newEnzymeClass->retrieveFromDB();
-	$ecHash->{$ecNumber} = $newEnzymeClass->getId();
+        $self->undefPointerCache();
     }
-    return $ecHash->{$ecNumber};
+
+    close($FH);
+
+    return "Inserted $inserted rows. Skipped: $skipped_existing already in DB, $skipped_no_ec unknown EC.\n";
 }
 
-###### FETCH THE AA SEQUNCE ID FOR A GIVEN ALIAS ######
-sub getAASeqId {
-    my ($self, $id, $dbh) = @_;
-    my $sql = "SELECT aa_sequence_id from dots.externalaasequence where source_id = '$id'";
-    my $sth = $dbh->prepare($sql);
-    $sth->execute();
-    my $aaSeqId;
-    while (my $aaSequenceId = $sth->fetchrow_array()) {
-        $aaSeqId = $aaSequenceId;
+sub getEnzymeClassId {
+    my ($self, $ecNumber, $cache) = @_;
+    unless (exists $cache->{$ecNumber}) {
+        my $ec = GUS::Model::SRes::EnzymeClass->new({ ec_number => $ecNumber });
+        $ec->retrieveFromDB();
+        $cache->{$ecNumber} = $ec->getId();
     }
-    $sth->finish();
-    return $aaSeqId;
+    return $cache->{$ecNumber};
 }
 
 sub undoTables {
-  my ($self) = @_;
-
-  return ('DoTS.AASequenceEnzymeClass');
+    my ($self) = @_;
+    return ('DoTS.AASequenceEnzymeClass');
 }
 
 1;

From c4e4e1b6c71b83a66dc36765e03b0e4b5683fd92 Mon Sep 17 00:00:00 2001
From: John Brestelli <jbrestell@gmail.com>
Date: Wed, 4 Mar 2026 14:00:53 -0500
Subject: [PATCH 6/7] Ensure correct numeric types when loading EC mapping data

Round domain_score and length_score to 4 decimal places, cast
length_mean to integer, and explicitly int() all count columns.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Load/plugin/perl/InsertEcMappingFromOrtho.pm | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/Load/plugin/perl/InsertEcMappingFromOrtho.pm b/Load/plugin/perl/InsertEcMappingFromOrtho.pm
index 3890a2350..0b9b52963 100644
--- a/Load/plugin/perl/InsertEcMappingFromOrtho.pm
+++ b/Load/plugin/perl/InsertEcMappingFromOrtho.pm
@@ -150,15 +150,15 @@ sub run {
             aa_sequence_id         => $aa_sequence_id,
             enzyme_class_id        => $enzyme_class_id,
             evidence_code          => $evidCode,
-            domain_score           => $confidence_score,
-            length_score           => $length_vs_cluster_mean,
-            length_mean            => $cluster_mean_length,
-            num_supporting_cluster => $n_supporting,
-            num_protein_cluster    => $cluster_size,
-            num_any_ec_cluster     => $n_annotated_in_cluster,
-            num_supporting_group   => $n_supporting_in_group,
-            num_protein_group      => $group_size,
-            num_any_ec_group       => $n_annotated_in_group,
+            domain_score           => sprintf("%.4f", $confidence_score),
+            length_score           => sprintf("%.4f", $length_vs_cluster_mean),
+            length_mean            => int($cluster_mean_length + 0.5),
+            num_supporting_cluster => int($n_supporting),
+            num_protein_cluster    => int($cluster_size),
+            num_any_ec_cluster     => int($n_annotated_in_cluster),
+            num_supporting_group   => int($n_supporting_in_group),
+            num_protein_group      => int($group_size),
+            num_any_ec_group       => int($n_annotated_in_group),
         });
 
         if ($newRow->retrieveFromDB()) {

From eb051812dddf5217e947c7c93dbd99cdf97423d0 Mon Sep 17 00:00:00 2001
From: John Brestelli <jbrestell@gmail.com>
Date: Wed, 11 Mar 2026 14:18:06 -0400
Subject: [PATCH 7/7] Skip single-gene clusters and already-annotated EC
 assignments

Only output rows for novel EC assignments (genes missing the EC) and
only process interpro clusters with more than one gene.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Load/bin/assignEcByOrthologs.pl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Load/bin/assignEcByOrthologs.pl b/Load/bin/assignEcByOrthologs.pl
index a0fa3b527..80a2cb76e 100755
--- a/Load/bin/assignEcByOrthologs.pl
+++ b/Load/bin/assignEcByOrthologs.pl
@@ -258,6 +258,7 @@
         next if $n_annotated == 0;   # nothing to propagate from
 
         my $cluster_size    = scalar @cluster;
+        next if $cluster_size == 1;
         my $display_profile = $profile_key eq '' ? 'none' : $profile_key;
 
         my @cluster_lengths = map { $protein_info{$_}{protein_length} } @cluster;
@@ -354,6 +355,7 @@
                 my $support = $pass->{support};
                 my $score   = $pass->{score};
                 my $is_novel = $existing_ecs{$ec} ? 0 : 1;
+                next unless $is_novel;
 
                 my $len_vs_mean = $cluster_mean > 0
                     ? ($info->{protein_length} - $cluster_mean) / $cluster_mean