nf-core · vagkaratzas · Mar 31, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026
diff --git a/modules/nf-core/caalm/caalm/environment.yml b/modules/nf-core/caalm/caalm/environment.yml
@@ -0,0 +1,11 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::faiss-cpu=1.10.0 # imported as `faiss` by the version check in main.nf
+  - conda-forge::python=3.10.0
+  - pip: # NOTE(review): no conda `pip` package is pinned here, yet the container name in main.nf includes `pip` — confirm whether it should be listed
+      - caalm==1.0.0
+      - torch==2.6.0
diff --git a/modules/nf-core/caalm/caalm/main.nf b/modules/nf-core/caalm/caalm/main.nf
@@ -0,0 +1,75 @@
+// Runs the `caalm` command-line tool on one protein FASTA file with the
+// level0/level1/level2 model directories supplied as the second input, and
+// emits its prediction, probability, statistics, optional embedding and log
+// files plus the versions of caalm, python, torch and faiss.
+process CAALM_CAALM {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/84/840046fbd06b709533c0f9443a9ab04663012feb074ebca60067c0adb76baa21/data':
+        'community.wave.seqera.io/library/faiss-cpu_python_pip_caalm_torch:c3008a34cb7c94b7' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    // level2 must contain model.pt, faiss/ and refdb/ (paths used in the script block).
+    tuple path(level0), path(level1), path(level2)
+
+    output:
+    tuple val(meta), path("${prefix}_predictions.tsv")      , emit: predictions
+    tuple val(meta), path("${prefix}_probabilities.jsonl")  , emit: probabilities
+    tuple val(meta), path("${prefix}_statistics.tsv")       , emit: statistics
+    // Embedding files are optional; the stub creates each one only when the
+    // matching --save-levelN-embeddings flag is present in task.ext.args.
+    tuple val(meta), path("${prefix}_level0_embeddings.npy"), emit: embeddings_level0, optional: true
+    tuple val(meta), path("${prefix}_level1_embeddings.npy"), emit: embeddings_level1, optional: true
+    tuple val(meta), path("${prefix}_level2_embeddings.npy"), emit: embeddings_level2, optional: true
+    tuple val(meta), path("${prefix}.log")                  , emit: log
+    tuple val("${task.process}"), val('caalm'), eval("caalm --version 2>&1 | head -1"), topic: versions, emit: versions_caalm
+    tuple val("${task.process}"), val('python'), eval("python --version | sed 's/Python //'"), topic: versions, emit: versions_python
+    tuple val("${task.process}"), val('torch'), eval("python -c 'import torch; print(torch.__version__)'"), topic: versions, emit: versions_torch
+    tuple val("${task.process}"), val('faiss'), eval("python -c 'import faiss; print(faiss.__version__)'"), topic: versions, emit: versions_faiss
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    // Assigned without `def` because the output: paths above also reference prefix.
+    prefix = task.ext.prefix ?: "${meta.id}"
+    // stderr is merged into the stdout redirect so the emitted log file holds
+    // both output streams of the caalm run.
+    """
+    caalm \\
+        $args \\
+        --num-workers ${task.cpus} \\
+        --level0-model ${level0} \\
+        --level1-model ${level1} \\
+        --level2-model ${level2}/model.pt \\
+        --level2-faiss-dir ${level2}/faiss \\
+        --level2-label-tsv-dir ${level2}/refdb \\
+        --output-name ${prefix} \\
+        -o . \\
+        ${fasta} \\
+        > ${prefix}.log 2>&1
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    // Creates empty placeholders for every required output and, per flag in args,
+    // for the optional embedding files.
+    """
+    echo "$args"
+
+    touch ${prefix}_predictions.tsv
+    touch ${prefix}_probabilities.jsonl
+    touch ${prefix}_statistics.tsv
+    touch ${prefix}.log
+
+    if [[ "$args" == *"--save-level0-embeddings"* ]]; then touch ${prefix}_level0_embeddings.npy; fi
+    if [[ "$args" == *"--save-level1-embeddings"* ]]; then touch ${prefix}_level1_embeddings.npy; fi
+    if [[ "$args" == *"--save-level2-embeddings"* ]]; then touch ${prefix}_level2_embeddings.npy; fi
+    """
+}
diff --git a/modules/nf-core/caalm/caalm/meta.yml b/modules/nf-core/caalm/caalm/meta.yml
@@ -0,0 +1,238 @@
+name: "caalm_caalm"
+description: |
+  Annotates carbohydrate-active enzyme (CAZyme) families from protein sequences
+  using protein language model (ESM) embeddings and FAISS-based nearest-neighbour
+  search. Performs three-level hierarchical classification: binary CAZyme detection
+  (Level 0), CAZy class assignment (Level 1), and CAZy family assignment (Level 2).
+keywords:
+  - cazyme
+  - annotation
+  - protein
+  - language model
+  - deep learning
+  - classification
+tools:
+  - "caalm":
+      description: |
+        CAALM (Carbohydrate Activity Annotation with protein Language Models) predicts
+        CAZyme class and family membership from protein FASTA sequences using ESM-based
+        embeddings and FAISS nearest-neighbour retrieval.
+      homepage: "https://github.com/lczong/CAALM"
+      documentation: "https://github.com/lczong/CAALM"
+      tool_dev_url: "https://github.com/lczong/CAALM"
+      licence:
+        - "MIT"
+      identifier: "" # NOTE(review): left empty — confirm that no registry identifier exists for this tool
+
+input:
+  - - meta: # first input tuple in main.nf: val(meta), path(fasta)
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1' ]`
+    - fasta:
+        type: file
+        description: Protein sequences in FASTA format.
+        pattern: "*.{fa,fasta,faa}"
+        ontologies:
+          - edam: http://edamontology.org/format_1929 # FASTA
+  - - level0: # second input tuple in main.nf: path(level0), path(level1), path(level2)
+        type: directory
+        description: |
+          Directory containing the Level 0 (binary CAZyme detection) model files,
+          as produced by CAALM_DOWNLOADMODELS.
+        pattern: "models/level0"
+        ontologies: []
+    - level1:
+        type: directory
+        description: |
+          Directory containing the Level 1 (CAZy class assignment) model files,
+          as produced by CAALM_DOWNLOADMODELS.
+        pattern: "models/level1"
+        ontologies: []
+    - level2:
+        type: directory
+        description: |
+          Level 2 (CAZy family assignment) model directory from CAALM_DOWNLOADMODELS;
+          must contain `model.pt` and the `faiss/` and `refdb/` subdirectories.
+        pattern: "models/level2"
+        ontologies: []
+
+output:
+  predictions:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - ${prefix}_predictions.tsv:
+          type: file
+          description: |
+            Tab-separated file with per-sequence CAZy class and family predictions
+            across all three classification levels.
+          pattern: "*_predictions.tsv"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  probabilities:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - ${prefix}_probabilities.jsonl:
+          type: file
+          description: |
+            JSON Lines file with per-sequence probability scores at all three
+            classification levels.
+          pattern: "*_probabilities.jsonl"
+          ontologies:
+            - edam: http://edamontology.org/format_3464 # JSON
+  statistics:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - ${prefix}_statistics.tsv:
+          type: file
+          description: |
+            Tab-separated summary file with counts and percentages of predicted
+            CAZyme classes and families across the input sequences.
+          pattern: "*_statistics.tsv"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  embeddings_level0: # declared `optional: true` in main.nf
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - ${prefix}_level0_embeddings.npy:
+          type: file
+          description: |
+            NumPy array file containing Level 0 ESM embeddings for each input
+            sequence. Optional; produced when `--save-level0-embeddings` is passed.
+          pattern: "*_level0_embeddings.npy"
+          ontologies: []
+  embeddings_level1: # declared `optional: true` in main.nf
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - ${prefix}_level1_embeddings.npy:
+          type: file
+          description: |
+            NumPy array file containing Level 1 projected embeddings for each input
+            sequence. Optional; produced when `--save-level1-embeddings` is passed.
+          pattern: "*_level1_embeddings.npy"
+          ontologies: []
+  embeddings_level2: # declared `optional: true` in main.nf
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - ${prefix}_level2_embeddings.npy:
+          type: file
+          description: |
+            NumPy array file containing Level 2 projected embeddings for each input
+            sequence. Optional; produced when `--save-level2-embeddings` is passed.
+          pattern: "*_level2_embeddings.npy"
+          ontologies: []
+  log:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - ${prefix}.log:
+          type: file
+          description: Log file containing the stdout and stderr output of the caalm run.
+          pattern: "*.log"
+          ontologies:
+            - edam: http://edamontology.org/format_2330 # Textual format
+  versions_caalm: # each versions_* entry mirrors one `topic: versions` tuple in main.nf
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - caalm:
+          type: string
+          description: The name of the tool
+      - "caalm --version 2>&1 | head -1":
+          type: eval
+          description: The expression to obtain the version of the tool
+  versions_python:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - python:
+          type: string
+          description: The name of the tool
+      - "python --version | sed 's/Python //'":
+          type: eval
+          description: The expression to obtain the version of the tool
+  versions_torch:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - torch:
+          type: string
+          description: The name of the tool
+      - "python -c 'import torch; print(torch.__version__)'":
+          type: eval
+          description: The expression to obtain the version of the tool
+  versions_faiss:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - faiss:
+          type: string
+          description: The name of the tool
+      - "python -c 'import faiss; print(faiss.__version__)'":
+          type: eval
+          description: The expression to obtain the version of the tool
+
+topics:
+  versions: # one entry per tuple that main.nf sends to the `versions` topic
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - caalm:
+          type: string
+          description: The name of the tool
+      - "caalm --version 2>&1 | head -1":
+          type: eval
+          description: The expression to obtain the version of the tool
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - python:
+          type: string
+          description: The name of the tool
+      - "python --version | sed 's/Python //'":
+          type: eval
+          description: The expression to obtain the version of the tool
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - torch:
+          type: string
+          description: The name of the tool
+      - "python -c 'import torch; print(torch.__version__)'":
+          type: eval
+          description: The expression to obtain the version of the tool
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - faiss:
+          type: string
+          description: The name of the tool
+      - "python -c 'import faiss; print(faiss.__version__)'":
+          type: eval
+          description: The expression to obtain the version of the tool
+
+authors: # handles of the people credited with writing this module
+  - "@vagkaratzas"
+maintainers: # handles of the people responsible for maintaining this module
+  - "@vagkaratzas"