Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions modules/nf-core/caalm/caalm/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the CAALM_CAALM module.
channels:
  - conda-forge
  - bioconda
dependencies:
  - conda-forge::faiss-cpu=1.10.0
  - conda-forge::python=3.10.0
  # Packages listed under `pip:` are installed with pip inside the conda
  # environment; they must be nested beneath the `pip` key to be recognised.
  - pip:
      - caalm==1.0.0
      - torch==2.6.0
63 changes: 63 additions & 0 deletions modules/nf-core/caalm/caalm/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
 * CAALM_CAALM
 *
 * Runs `caalm` on one FASTA file using three pre-staged model inputs.
 *
 * Inputs:
 *   - meta, fasta            : sample metadata map and the FASTA file to annotate.
 *   - level0, level1, level2 : model paths. `level0` and `level1` are passed to
 *                              caalm as-is; `level2` must be a directory holding
 *                              `model.pt`, `faiss/` and `refdb/` (see the command).
 *
 * Outputs (all named after `prefix`):
 *   - predictions, probabilities, statistics : always expected.
 *   - embeddings_level{0,1,2}                : optional `.npy` files; the stub creates
 *                                              them only when the matching
 *                                              `--save-levelN-embeddings` flag is in
 *                                              `task.ext.args`.
 *   - log                                    : stdout of the caalm run.
 *   - versions_*                             : tool versions published to the
 *                                              `versions` topic.
 */
process CAALM_CAALM {
    tag "$meta.id"
    label 'process_high'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/84/840046fbd06b709533c0f9443a9ab04663012feb074ebca60067c0adb76baa21/data':
        'community.wave.seqera.io/library/faiss-cpu_python_pip_caalm_torch:c3008a34cb7c94b7' }"

    input:
    tuple val(meta), path(fasta)
    tuple path(level0), path(level1), path(level2)

    output:
    tuple val(meta), path("${prefix}_predictions.tsv")      , emit: predictions
    tuple val(meta), path("${prefix}_probabilities.jsonl")  , emit: probabilities
    tuple val(meta), path("${prefix}_statistics.tsv")       , emit: statistics
    tuple val(meta), path("${prefix}_level0_embeddings.npy"), emit: embeddings_level0, optional: true
    tuple val(meta), path("${prefix}_level1_embeddings.npy"), emit: embeddings_level1, optional: true
    tuple val(meta), path("${prefix}_level2_embeddings.npy"), emit: embeddings_level2, optional: true
    tuple val(meta), path("${prefix}.log")                  , emit: log
    tuple val("${task.process}"), val('caalm'), eval("caalm --version 2>&1 | head -1"), topic: versions, emit: versions_caalm
    tuple val("${task.process}"), val('python'), eval("python --version | sed 's/Python //'"), topic: versions, emit: versions_python
    tuple val("${task.process}"), val('torch'), eval("python -c 'import torch; print(torch.__version__)'"), topic: versions, emit: versions_torch
    tuple val("${task.process}"), val('faiss'), eval("python -c 'import faiss; print(faiss.__version__)'"), topic: versions, emit: versions_faiss

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // `prefix` is intentionally declared without `def` so that the output
    // block above can resolve "${prefix}" when collecting files.
    prefix = task.ext.prefix ?: "${meta.id}"
    // Only stdout is redirected into ${prefix}.log; stderr is left untouched.
    // The FASTA path is the final positional argument of the command.
    """
    caalm \\
        $args \\
        --num-workers ${task.cpus} \\
        --level0-model ${level0} \\
        --level1-model ${level1} \\
        --level2-model ${level2}/model.pt \\
        --level2-faiss-dir ${level2}/faiss \\
        --level2-label-tsv-dir ${level2}/refdb \\
        --output-name ${prefix} \\
        -o . \\
        ${fasta} \\
        > ${prefix}.log
    """

    stub:
    def args = task.ext.args ?: ''
    prefix = task.ext.prefix ?: "${meta.id}"
    // Create empty placeholders for the mandatory outputs, and for each optional
    // embeddings file only when its `--save-levelN-embeddings` flag was requested.
    """
    echo "$args"

    touch ${prefix}_predictions.tsv
    touch ${prefix}_probabilities.jsonl
    touch ${prefix}_statistics.tsv
    touch ${prefix}.log

    if [[ "$args" == *"--save-level0-embeddings"* ]]; then touch ${prefix}_level0_embeddings.npy; fi
    if [[ "$args" == *"--save-level1-embeddings"* ]]; then touch ${prefix}_level1_embeddings.npy; fi
    if [[ "$args" == *"--save-level2-embeddings"* ]]; then touch ${prefix}_level2_embeddings.npy; fi
    """
}
238 changes: 238 additions & 0 deletions modules/nf-core/caalm/caalm/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
# Module identity, free-text description, search keywords and tool metadata.
name: "caalm_caalm"
description: |
  Annotates carbohydrate-active enzyme (CAZyme) families from protein sequences
  using protein language model (ESM) embeddings and FAISS-based nearest-neighbour
  search. Performs three-level hierarchical classification: binary CAZyme detection
  (Level 0), CAZy class assignment (Level 1), and CAZy family assignment (Level 2).
keywords:
  - cazyme
  - annotation
  - protein
  - language model
  - deep learning
  - classification
tools:
  - "caalm":
      description: |
        CAALM (Carbohydrate Activity Annotation with protein Language Models) predicts
        CAZyme class and family membership from protein FASTA sequences using ESM-based
        embeddings and FAISS nearest-neighbour retrieval.
      homepage: "https://github.com/lczong/CAALM"
      documentation: "https://github.com/lczong/CAALM"
      tool_dev_url: "https://github.com/lczong/CAALM"
      licence:
        - "MIT"
      identifier: ""

# Two input channels, mirroring the process `input:` block:
#   1. tuple (meta, fasta)
#   2. tuple (level0, level1, level2)
input:
  - - meta:
        type: map
        description: |
          Groovy Map containing sample information
          e.g. `[ id:'sample1' ]`
    - fasta:
        type: file
        description: Protein sequences in FASTA format.
        pattern: "*.{fa,fasta,faa}"
        ontologies:
          - edam: http://edamontology.org/format_1929 # FASTA
  - - level0:
        type: directory
        description: |
          Directory containing the Level 0 (binary CAZyme detection) model files,
          as produced by CAALM_DOWNLOADMODELS.
        pattern: "models/level0"
        ontologies: []
    - level1:
        type: directory
        description: |
          Directory containing the Level 1 (CAZy class assignment) model files,
          as produced by CAALM_DOWNLOADMODELS.
        pattern: "models/level1"
        ontologies: []
    - level2:
        type: directory
        description: |
          Directory containing the Level 2 (CAZy family assignment) model files,
          as produced by CAALM_DOWNLOADMODELS.
        pattern: "models/level2"
        ontologies: []

# One entry per `emit:` name of the process; each entry lists the elements of
# the emitted tuple in order.
output:
  predictions:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - ${prefix}_predictions.tsv:
          type: file
          description: |
            Tab-separated file with per-sequence CAZy class and family predictions
            across all three classification levels.
          pattern: "*_predictions.tsv"
          ontologies:
            - edam: http://edamontology.org/format_3475 # TSV
  probabilities:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - ${prefix}_probabilities.jsonl:
          type: file
          description: |
            JSON Lines file with per-sequence probability scores at all three
            classification levels.
          pattern: "*_probabilities.jsonl"
          ontologies:
            - edam: http://edamontology.org/format_3464 # JSON
  statistics:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - ${prefix}_statistics.tsv:
          type: file
          description: |
            Tab-separated summary file with counts and percentages of predicted
            CAZyme classes and families across the input sequences.
          pattern: "*_statistics.tsv"
          ontologies:
            - edam: http://edamontology.org/format_3475 # TSV
  embeddings_level0:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - ${prefix}_level0_embeddings.npy:
          type: file
          description: |
            NumPy array file containing Level 0 ESM embeddings for each input
            sequence. Optional; produced when `--save-level0-embeddings` is passed.
          pattern: "*_level0_embeddings.npy"
          ontologies: []
  embeddings_level1:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - ${prefix}_level1_embeddings.npy:
          type: file
          description: |
            NumPy array file containing Level 1 projected embeddings for each input
            sequence. Optional; produced when `--save-level1-embeddings` is passed.
          pattern: "*_level1_embeddings.npy"
          ontologies: []
  embeddings_level2:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - ${prefix}_level2_embeddings.npy:
          type: file
          description: |
            NumPy array file containing Level 2 projected embeddings for each input
            sequence. Optional; produced when `--save-level2-embeddings` is passed.
          pattern: "*_level2_embeddings.npy"
          ontologies: []
  log:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`
      - ${prefix}.log:
          type: file
          # The process redirects only stdout (`> ${prefix}.log`); stderr is
          # not written to this file.
          description: Log file containing the standard output of the caalm run.
          pattern: "*.log"
          ontologies:
            - edam: http://edamontology.org/format_2330 # Textual format
  versions_caalm:
    - - ${task.process}:
          type: string
          description: The name of the process
      - caalm:
          type: string
          description: The name of the tool
      - caalm --version 2>&1 | head -1:
          type: eval
          description: The expression to obtain the version of the tool
  versions_python:
    - - ${task.process}:
          type: string
          description: The name of the process
      - python:
          type: string
          description: The name of the tool
      - "python --version | sed 's/Python //'":
          type: eval
          description: The expression to obtain the version of the tool
  versions_torch:
    - - ${task.process}:
          type: string
          description: The name of the process
      - torch:
          type: string
          description: The name of the tool
      - "python -c 'import torch; print(torch.__version__)'":
          type: eval
          description: The expression to obtain the version of the tool
  versions_faiss:
    - - ${task.process}:
          type: string
          description: The name of the process
      - faiss:
          type: string
          description: The name of the tool
      - "python -c 'import faiss; print(faiss.__version__)'":
          type: eval
          description: The expression to obtain the version of the tool

# Tuples published to the `versions` topic: one (process, tool, version
# expression) triple per tool, matching the `versions_*` outputs of the process.
topics:
  versions:
    - - ${task.process}:
          type: string
          description: The name of the process
      - caalm:
          type: string
          description: The name of the tool
      - caalm --version 2>&1 | head -1:
          type: eval
          description: The expression to obtain the version of the tool
    - - ${task.process}:
          type: string
          description: The name of the process
      - python:
          type: string
          description: The name of the tool
      - "python --version | sed 's/Python //'":
          type: eval
          description: The expression to obtain the version of the tool
    - - ${task.process}:
          type: string
          description: The name of the process
      - torch:
          type: string
          description: The name of the tool
      - "python -c 'import torch; print(torch.__version__)'":
          type: eval
          description: The expression to obtain the version of the tool
    - - ${task.process}:
          type: string
          description: The name of the process
      - faiss:
          type: string
          description: The name of the tool
      - "python -c 'import faiss; print(faiss.__version__)'":
          type: eval
          description: The expression to obtain the version of the tool

# GitHub handles of the module's original authors and current maintainers.
authors:
  - "@vagkaratzas"
maintainers:
  - "@vagkaratzas"
Loading
Loading