Sorry for this newbie question, but I’m a little confused on how the Snakefile actually works. I tried to use the Snakefile linked from the Zika tutorial (zika-tutorial/Snakefile at master · nextstrain/zika-tutorial · GitHub) as a starter. But I’m getting an error on augur index
which is the first command run:
Building DAG of jobs...
MissingInputException in rule index_sequences in file /nextstrain/build/Snakefile, line 13:
Missing input files for rule index_sequences:
output: results/sequence_index.tsv
affected files:
../data/Genotype_3.fa
Here’s my Snakefile:
(I also tried adding --output {output.sequence_index}
to augur index
, and also to use full path to the input file)
rule all:
input:
auspice_json = "auspice/Genotype_3.json",
input_fasta = "../data/Genotype_3.fa",
input_metadata = "../data/Genotype_3.metadata.tsv",
dropped_strains = "config/dropped_strains.txt",
reference = "../data/Genotype_3.reference.gb",
colors = "config/colors.tsv",
lat_longs = "config/lat_longs.tsv",
auspice_config = "config/auspice_config.json"
rule index_sequences:
message:
"""
Creating an index of sequence composition for filtering.
"""
input:
sequences = input_fasta
output:
sequence_index = "results/sequence_index.tsv"
shell:
"""
augur index \
--sequences {input.sequences} \
"""
rule filter:
message:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- from {params.min_date} onwards
- excluding strains in {input.exclude}
"""
input:
sequences = input_fasta,
sequence_index = "results/sequence_index.tsv",
metadata = input_metadata,
exclude = dropped_strains
output:
sequences = "results/filtered.fasta"
params:
group_by = "country year",
sequences_per_group = 100,
min_date = 1900
shell:
"""
augur filter \
--sequences {input.sequences} \
--sequence-index {input.sequence_index} \
--metadata {input.metadata} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--min-date {params.min_date}
"""
rule align:
message:
"""
Aligning sequences to {input.reference}
- filling gaps with N
"""
input:
sequences = rules.filter.output.sequences,
reference = reference
output:
alignment = "results/aligned.fasta"
shell:
"""
augur align \
--sequences {input.sequences} \
--reference-sequence {input.reference} \
--output {output.alignment} \
--fill-gaps
"""
rule tree:
message: "Building tree"
input:
alignment = rules.align.output.alignment
output:
tree = "results/tree_raw.nwk"
shell:
"""
augur tree \
--alignment {input.alignment} \
--output {output.tree}
"""
rule refine:
message:
"""
Refining tree
- estimate timetree
- use {params.coalescent} coalescent timescale
- estimate {params.date_inference} node dates
- filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
"""
input:
tree = rules.tree.output.tree,
alignment = rules.align.output,
metadata = input_metadata
output:
tree = "results/tree.nwk",
node_data = "results/branch_lengths.json"
params:
coalescent = "opt",
date_inference = "marginal",
clock_filter_iqd = 4
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--timetree \
--coalescent {params.coalescent} \
--date-confidence \
--date-inference {params.date_inference} \
--clock-filter-iqd {params.clock_filter_iqd}
"""
rule ancestral:
message: "Reconstructing ancestral sequences and mutations"
input:
tree = rules.refine.output.tree,
alignment = rules.align.output
output:
node_data = "results/nt_muts.json"
params:
inference = "joint"
shell:
"""
augur ancestral \
--tree {input.tree} \
--alignment {input.alignment} \
--output-node-data {output.node_data} \
--inference {params.inference}
"""
rule translate:
message: "Translating amino acid sequences"
input:
tree = rules.refine.output.tree,
node_data = rules.ancestral.output.node_data,
reference = reference
output:
node_data = "results/aa_muts.json"
shell:
"""
augur translate \
--tree {input.tree} \
--ancestral-sequences {input.node_data} \
--reference-sequence {input.reference} \
--output-node-data {output.node_data} \
"""
rule traits:
message: "Inferring ancestral traits for {params.columns!s}"
input:
tree = rules.refine.output.tree,
metadata = input_metadata
output:
node_data = "results/traits.json",
params:
columns = "region country"
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--output-node-data {output.node_data} \
--columns {params.columns} \
--confidence
"""
rule export:
message: "Exporting data files for for auspice"
input:
tree = rules.refine.output.tree,
metadata = input_metadata,
branch_lengths = rules.refine.output.node_data,
traits = rules.traits.output.node_data,
nt_muts = rules.ancestral.output.node_data,
aa_muts = rules.translate.output.node_data,
colors = colors,
lat_longs = lat_longs,
auspice_config = auspice_config
output:
auspice_json = rules.all.input.auspice_json,
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
--colors {input.colors} \
--lat-longs {input.lat_longs} \
--auspice-config {input.auspice_config} \
--include-root-sequence \
--output {output.auspice_json}
"""
rule clean:
message: "Removing directories: {params}"
params:
"results ",
"auspice"
shell:
"rm -rfv {params}"