# Create a dedicated environment and install SemiBin together with the tools
# used below for assembly (MEGAHIT) and read mapping (Bowtie2, SAMtools).
conda create -n semibin_tutorial python=3.8
# Every install names the target environment explicitly; without -n the
# package would go into whichever environment happens to be active instead of
# semibin_tutorial.
conda install -n semibin_tutorial -c conda-forge -c bioconda semibin
conda install -n semibin_tutorial -c conda-forge -c bioconda bowtie2
conda install -n semibin_tutorial -c conda-forge -c bioconda samtools
conda install -n semibin_tutorial -c conda-forge -c bioconda megahit
# Activate the environment so the commands that follow resolve to the tools
# installed above (requires a shell that has been set up with `conda init`).
conda activate semibin_tutorial
Assembly
# Assemble the paired-end reads of sample1 with MEGAHIT. The later steps read
# the resulting contigs from assembly_contig/R1.contigs.fa.
megahit \
  --out-dir assembly_contig --out-prefix R1 \
  -1 sample1_R1.fastq.gz \
  -2 sample1_R2.fastq.gz
Mapping
# Build a Bowtie2 index for the contigs; the FASTA path doubles as the index
# prefix, so it is passed twice.
bowtie2-build -f \
  assembly_contig/R1.contigs.fa \
  assembly_contig/R1.contigs.fa

# Align the paired FASTQ reads of sample1 to the contigs and write SAM.
bowtie2 -p 64 -q --fr \
  -x assembly_contig/R1.contigs.fa \
  -1 sample1_R1.fastq.gz -2 sample1_R2.fastq.gz \
  -S sample1.sam

# Convert SAM to BAM, keep only mapped reads (-F 4), then sort the result.
samtools view -@ 64 -h -b -S -o sample1.bam sample1.sam
samtools view -@ 64 -b -F 4 -o sample1.mapped.bam sample1.bam
samtools sort -@ 64 -m 1000000000 \
  -o sample1.mapped.sorted.bam \
  sample1.mapped.bam
# Single-sample binning in one command: contigs plus the sorted BAM in,
# results written under easy_single_sample_output.
SemiBin single_easy_bin \
  -o easy_single_sample_output \
  -i assembly_contig/R1.contigs.fa \
  -b sample1.mapped.sorted.bam
Alternatively, use a pretrained model (for example, for a sample from the human gut):
# Same one-command binning, but selecting the human_gut environment via
# --environment; output goes to a separate directory.
SemiBin single_easy_bin \
  --environment human_gut \
  -o easy_single_sample_output_pretrain \
  -i assembly_contig/R1.contigs.fa \
  -b sample1.mapped.sorted.bam
(1) Generate data.csv/data_split.csv
# Compute the per-contig feature tables; the later train/bin steps read
# data.csv and data_split.csv from advanced_single_sample_output.
SemiBin generate_sequence_features_single \
  -o advanced_single_sample_output \
  -i assembly_contig/R1.contigs.fa \
  -b sample1.mapped.sorted.bam
(2) Generate cannot-link
# Produce the cannot-link file for the contigs; the training step reads it
# from advanced_single_sample_output/cannot/cannot.txt.
SemiBin generate_cannot_links \
  -o advanced_single_sample_output \
  -i assembly_contig/R1.contigs.fa
(3) Train
# Train on the feature tables and cannot-link file generated above; the
# binning step afterwards loads advanced_single_sample_output/model.h5.
SemiBin train \
  --mode single \
  -i assembly_contig/R1.contigs.fa \
  -c advanced_single_sample_output/cannot/cannot.txt \
  --data advanced_single_sample_output/data.csv \
  --data-split advanced_single_sample_output/data_split.csv \
  -o advanced_single_sample_output
(4) Bin
# Bin the contigs using the model trained in the previous step.
SemiBin bin \
  -o advanced_single_sample_output \
  -i assembly_contig/R1.contigs.fa \
  --data advanced_single_sample_output/data.csv \
  --model advanced_single_sample_output/model.h5
Alternatively, bin with a pretrained model instead of the one trained in step (3):
# Bin the same contigs and features, selecting the human_gut environment
# instead of passing --model; results go to a separate output directory.
SemiBin bin \
  --environment human_gut \
  -o advanced_single_sample_output_pretrain \
  -i assembly_contig/R1.contigs.fa \
  --data advanced_single_sample_output/data.csv
Assembly
# Co-assemble both samples in one MEGAHIT run (comma-separated read lists).
# The later steps read the contigs from assembly_contig/coassembly.contigs.fa.
megahit \
  --out-dir assembly_contig --out-prefix coassembly \
  -1 sample1_R1.fastq.gz,sample2_R1.fastq.gz \
  -2 sample1_R2.fastq.gz,sample2_R2.fastq.gz
Mapping
# Build a Bowtie2 index for the co-assembly (index prefix = FASTA path).
bowtie2-build -f \
  assembly_contig/coassembly.contigs.fa \
  assembly_contig/coassembly.contigs.fa

# Map each sample's reads to the co-assembly and produce one sorted BAM of
# mapped reads per sample.
for n in 1 2; do
  bowtie2 -p 64 -q --fr \
    -x assembly_contig/coassembly.contigs.fa \
    -1 "sample${n}_R1.fastq.gz" -2 "sample${n}_R2.fastq.gz" \
    -S "sample${n}.sam"
  # SAM -> BAM, drop unmapped reads (-F 4), sort.
  samtools view -@ 64 -h -b -S -o "sample${n}.bam" "sample${n}.sam"
  samtools view -@ 64 -b -F 4 -o "sample${n}.mapped.bam" "sample${n}.bam"
  samtools sort -@ 64 -m 1000000000 \
    -o "sample${n}.mapped.sorted.bam" \
    "sample${n}.mapped.bam"
done
# One-command binning of the co-assembly; the glob passes every per-sample
# sorted BAM in the working directory to -b.
SemiBin single_easy_bin \
  -o easy_coassembly_sample_output \
  -i assembly_contig/coassembly.contigs.fa \
  -b sample*.mapped.sorted.bam
(1) Generate data.csv/data_split.csv
# Compute the feature tables for the co-assembly from all per-sample BAMs;
# train/bin later read data.csv and data_split.csv from the output directory.
SemiBin generate_sequence_features_single \
  -o advanced_coassembly_sample_output \
  -i assembly_contig/coassembly.contigs.fa \
  -b sample*.mapped.sorted.bam
(2) Generate cannot-link
# Produce the cannot-link file for the co-assembly; training reads it from
# advanced_coassembly_sample_output/cannot/cannot.txt.
SemiBin generate_cannot_links \
  -o advanced_coassembly_sample_output \
  -i assembly_contig/coassembly.contigs.fa
(3) Train
# Train on the co-assembly's feature tables and cannot-link file; the binning
# step afterwards loads advanced_coassembly_sample_output/model.h5.
SemiBin train \
  --mode single \
  -i assembly_contig/coassembly.contigs.fa \
  -c advanced_coassembly_sample_output/cannot/cannot.txt \
  --data advanced_coassembly_sample_output/data.csv \
  --data-split advanced_coassembly_sample_output/data_split.csv \
  -o advanced_coassembly_sample_output
(4) Bin
# Bin the co-assembly contigs using the model trained in the previous step.
SemiBin bin \
  -o advanced_coassembly_sample_output \
  -i assembly_contig/coassembly.contigs.fa \
  --data advanced_coassembly_sample_output/data.csv \
  --model advanced_coassembly_sample_output/model.h5
Assembly
# Assemble each of the five samples separately; sample N is written to its own
# directory assembly_contigN with output prefix RN.
for n in 1 2 3 4 5; do
  megahit \
    --out-dir "assembly_contig${n}" --out-prefix "R${n}" \
    -1 "sample${n}_R1.fastq.gz" \
    -2 "sample${n}_R2.fastq.gz"
done
Concatenate contigs
# Concatenate the five per-sample assemblies into combined_output.
# The inputs are listed explicitly rather than with the glob
# assembly_contig*/R*.fa: `*` also matches the empty string, so that glob would
# additionally pick up assembly_contig/R1.contigs.fa left over from the
# single-sample section and feed sample R1 in twice.
contig_files=()
for sample_id in {1..5}; do
  contig_files+=("assembly_contig${sample_id}/R${sample_id}.contigs.fa")
done
SemiBin concatenate_fasta -i "${contig_files[@]}" -o combined_output
Mapping
# Build a Bowtie2 index for the concatenated contigs (index prefix = FASTA
# path).
bowtie2-build -f \
  combined_output/concatenated.fa \
  combined_output/concatenated.fa

# Map every sample's reads to the concatenated contigs and produce one sorted
# BAM of mapped reads per sample.
for n in 1 2 3 4 5; do
  bowtie2 -p 64 -q --fr \
    -x combined_output/concatenated.fa \
    -1 "sample${n}_R1.fastq.gz" -2 "sample${n}_R2.fastq.gz" \
    -S "sample${n}.sam"
  # SAM -> BAM, drop unmapped reads (-F 4), sort.
  samtools view -@ 64 -h -b -S -o "sample${n}.bam" "sample${n}.sam"
  samtools view -@ 64 -b -F 4 -o "sample${n}.mapped.bam" "sample${n}.bam"
  samtools sort -@ 64 -m 1000000000 \
    -o "sample${n}.mapped.sorted.bam" \
    "sample${n}.mapped.bam"
done
# Multi-sample binning in one command: concatenated contigs plus every
# per-sample sorted BAM in, results under easy_multi_sample_output.
SemiBin multi_easy_bin \
  -o easy_multi_sample_output \
  -i combined_output/concatenated.fa \
  -b sample*.mapped.sorted.bam
(1) Generate data.csv/data_split.csv
# Compute feature tables for the multi-sample run; the per-sample steps below
# read them from advanced_multi_output/samples/R<N>/.
SemiBin generate_sequence_features_multi \
  -o advanced_multi_output \
  -i combined_output/concatenated.fa \
  -b sample*.mapped.sorted.bam
(2) Generate cannot-link
# Generate a cannot-link file per sample from that sample's own assembly,
# writing into the sample's directory under advanced_multi_output/samples.
for idx in 1 2 3 4 5; do
  SemiBin generate_cannot_links \
    -o "advanced_multi_output/samples/R${idx}" \
    -i "assembly_contig${idx}/R${idx}.contigs.fa"
done
(3) Train
# Train one model per sample from that sample's feature tables and
# cannot-link file; each run writes back into the sample's own directory.
for idx in 1 2 3 4 5; do
  SemiBin train \
    --mode single \
    -i "assembly_contig${idx}/R${idx}.contigs.fa" \
    -c "advanced_multi_output/samples/R${idx}/cannot/cannot.txt" \
    --data "advanced_multi_output/samples/R${idx}/data.csv" \
    --data-split "advanced_multi_output/samples/R${idx}/data_split.csv" \
    -o "advanced_multi_output/samples/R${idx}"
done
(4) Bin
# Bin each sample's contigs with the model trained for that sample.
for idx in 1 2 3 4 5; do
  SemiBin bin \
    -o "advanced_multi_output/samples/R${idx}" \
    -i "assembly_contig${idx}/R${idx}.contigs.fa" \
    --data "advanced_multi_output/samples/R${idx}/data.csv" \
    --model "advanced_multi_output/samples/R${idx}/model.h5"
done