1.数据质控kneaddata

#安装质控软件
conda install -y fastqc
#安装宏基因组质控软件
conda install -y kneaddata
#安装失败，先装python3，再装bowtie2
conda install --channel conda-forge python=3.10
conda install -y bowtie2

https://github.com/biobakery/biobakery/wiki/kneaddata

#下载参考基因组--斑马鱼：GCA_000002035.4；新建文件夹host_db存放
#<https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000002035.6/>
#安装curl
sudo apt install curl
#下载
curl "<https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000002035.6/download?include_annotation_type=GENOME_FASTA&include_annotation_type=GENOME_GFF&include_annotation_type=RNA_FASTA&include_annotation_type=CDS_FASTA&include_annotation_type=PROT_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED>" -o Danio_rerio.GRCz11 
#后台运行
curl "<https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000002035.6/download?include_annotation_type=GENOME_FASTA&include_annotation_type=GENOME_GFF&include_annotation_type=RNA_FASTA&include_annotation_type=CDS_FASTA&include_annotation_type=PROT_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED>" -o Danio_rerio.GRCz11 -s &
#检查命令运行是否完成和文件大小--未完成--重新运行
(base) zms@shpc-49506-instance-6QslimRj:~/host_db$ ps aux | grep curl
zms         9180  0.0  0.0   8908   720 pts/1    S+   13:40   0:00 grep --color=auto curl
(base) zms@shpc-49506-instance-6QslimRj:~/host_db$ ls -lh
total 14M
-rw-rw-r-- 1 zms zms 14M Mar 11 23:00 Danio_rerio.GRCz11

#文件解压
unzip Danio_rerio.GRCz11 

sudo apt install tree

#创建定制数据库--斑马鱼 <https://www.jianshu.com/p/cc306b201707>
bowtie2-build --threads 30 GCF_000002035.6_GRCz11_genomic.fna Danio_rerio
#查看进度
(meta) zms@shpc-49506-instance-6QslimRj:~$ ps aux|grep bowtie2
zms         9728  0.0  0.0  21148 12252 pts/2    S+   21:39   0:00 python3 /home/zms/anaconda3/envs/meta/bin/bowtie2-build --threads 30 GCF_000002035.6_GRCz11_genomic.fna Danio_rerio
zms         9730  0.0  1.3 9533408 3496556 pts/2 Rl+  21:39 291:34 /home/zms/anaconda3/envs/meta/bin/bowtie2-build-s --wrapper basic-0 --threads 30 GCF_000002035.6_GRCz11_genomic.fna Danio_rerio
zms         9854  0.0  0.0   9040   720 pts/1    S+   22:17   0:00 grep --color=auto bowtie2

#失败--trimmomatic的jar无法找到的问题
##解决方案参考--<https://github.com/bioconda/bioconda-recipes/issues/18666>

(base) zms@shpc-49506-instance-6QslimRj:~$ conda activate meta
#查找trimmomatic
(meta) zms@shpc-49506-instance-6QslimRj:~$ which trimmomatic
/home/zms/anaconda3/envs/meta/bin/trimmomatic

#实际路径
(meta) zms@shpc-49506-instance-6QslimRj:~$ ls -lh /home/zms/anaconda3/envs/meta/bin/trimmomatic
lrwxrwxrwx 1 zms zms 39 Mar 11 22:35 /home/zms/anaconda3/envs/meta/bin/trimmomatic -> ../share/trimmomatic-0.39-2/trimmomatic
(meta) zms@shpc-49506-instance-6QslimRj:~$ java -jar ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/trimmomatic.jar
Usage:
       PE [-version] [-threads <threads>] [-phred33|-phred64] [-trimlog <trimLogFile>] [-summary <statsSummaryFile>] [-quiet] [-validatePairs] [-basein <inputBase> | <inputFile1> <inputFile2>] [-baseout <outputBase> | <outputFile1P> <outputFile1U> <outputFile2P> <outputFile2U>] <trimmer1>...
   or:
       SE [-version] [-threads <threads>] [-phred33|-phred64] [-trimlog <trimLogFile>] [-summary <statsSummaryFile>] [-quiet] <inputFile> <outputFile> <trimmer1>...
   or:
       -version

#修改sh
kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz -db ${host_db} --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/

kneaddata.sh

#!/bin/bash

if [ "$#" -eq "1" ];then
        prj_path=$1
#elif [ "$#" -eq "1" ];then
#       prj_path=$1
else
        echo "Usage: `basename $0` {prj_path(full path need)} {host_db(def:homo}"
        echo "You provided $# parameters,but 1 are required."
        exit 0
fi

host_db=~/metagenome/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio
threads_num=20

source ~/anaconda3/etc/profile.d/conda.sh
conda activate meta

# 建立结果文件夹
cd ${prj_path}
mkdir -p 01_remove_host 02_kraken2 03_Assembly/01_megahit 03_Assembly/02_prodigal 04_Quant 05_Annotation/01_humann 05_Annotation/02_rgi 05_Annotation/03_dbcan 05_Annotation/04_vfdb

echo "#### Step 1: 去宿主并质控 ####"
echo "##### run kneaddata to remove host reads... #####"
cd ${prj_path}/MetaOriginFile
for i in *
do
        if [ -d ${i} ];then
                if [ ! -d ${prj_path}/01_remove_host/${i%%.*} ];then
                {
                        echo "### Sample" ${i%%.*} "run kneaddata... ###"
                        mkdir ${prj_path}/01_remove_host/${i%%.*}
                        cd ${i}
                        kneaddata --input *1.clean.fq.gz --input *2.clean.fq.gz -db ${host_db} --reorder -t ${threads_num} -o ${prj_path}/01_remove_host/${i%%.*}/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/
                        rm ${prj_path}/01_remove_host/${i%%.*}/*{_bowtie2_,unmatched}*
                        cd ..
                }
                else
                        echo "### ${i%%.*} has run kneaddata ###"
                fi
        fi
done
echo "##### Step 1: kneaddata done #####"

#添加可执行权限
chmod a+x kneaddata.sh

kneaddata运行及测试、报错解决

(meta) zms@shpc-49506-instance-6QslimRj:~$ ./kneaddata.sh ~ &
[1] 10271
(meta) zms@shpc-49506-instance-6QslimRj:~$ #### Step 1: 去宿主并质控 ####
##### run kneaddata to remove host reads... #####
### Sample Y0715C run kneaddata... ###
Decompressing gzipped file ...

#查询进度
ps aux | grep knead
zms        10276  0.0  0.0  22436 14972 pts/4    R    23:51   0:25 /home/zms/anaconda3/envs/meta/bin/python3.10 /home/zms/anaconda3/envs/meta/bin/keaddata --input1 Y0715C_1.clean.fq.gz --input2 Y0715C_2.clean.fq.gz -db /home/zms/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o /home/zms/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic /home/zms/anaconda3/envs/meta/bin
zms        10284  0.0  0.0   9040   656 pts/4    S+   23:52   0:00 grep --color=auto knead

##报错测试
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0715C$ kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz -db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic /home/zms/anaconda3/envs/meta/bin/trimmomatic
ERROR: Unable to list files in trimmomatic directory: /home/zms/anaconda3/envs/meta/bin/trimmomatic
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0715C$ kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz -db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output  --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/

###测试Y0715C
kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/
##测试Y0718C-nohup不挂断运行命令
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0718C$ nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0718C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
[1] 17487
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0718C$ jobs
[1]+  Running                 nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0718C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &

#715失败，718成功--可能是命令运行一半终止--使用nohup命运重新运行
rm -rf Y0715C/
mkdir Y0715C Y0719C
#Y0715C
nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
#Y0719C
nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0719C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &

###查询进度
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0719C$ jobs
[1]-  Running                 nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &  (wd: ~/MetaOriginFile/Y0715C)
[2]+  Running                 nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0719C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &

###查看nohup执行命令的输出
cat MetaOriginFile/Y0715C/nohup.out
###失败---系统算力不足
nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 48 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &

nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 48 -o ~/01_remove_host/Y0719C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &

nohup ./kneaddata.sh ~/metagenome &

#pair.fastq没有内容-怀疑kneaddata版本过高-降版本 https://forum.biobakery.org/t/all-paired-end-read-unmatched/2895/7

conda remvoe kneaddata
conda install kneaddata=0.10

2.物种注释kraken2

https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown

kraken2下载及标准数据库构建

conda install -y kraken2

#下载kraken标准数据库
kraken2-build --standard --threads 96 --db kraken_standard 
#失败
(meta) zms@shpc-49506-instance-6QslimRj:~$ kraken2-build --standard --threads 96 --db kraken_standard 
Downloading nucleotide gb accession to taxon map.../home/zms/anaconda3/envs/meta/share/kraken2-2.1.3-0/libexec/download_taxonomy.sh: line 27: rsync: command not found            

#安装文件同步工具rsync
(meta) zms@shpc-49506-instance-6QslimRj:~$ sudo apt-get install rsync     

#重新运行后
(meta) zms@shpc-49506-instance-6QslimRj:~$ ps aux | grep krak
zms        15633  0.0  0.0   9500  3304 pts/1    S    21:11   0:00 /bin/bash /home/zms/anaconda3/envs/meta/share/kraken2-2.1.3-0/libexec/standard_installation.sh
zms        15634  0.0  0.0   9632  3576 pts/1    S    21:11   0:00 /bin/bash /home/zms/anaconda3/envs/meta/share/kraken2-2.1.3-0/libexec/download_taxonomy.sh
zms        15639  0.0  0.0   8908   716 pts/1    S+   21:13   0:00 grep --color=auto krak

#下载失败--重新运行--加入后台运行命令
#删除当前目录所有文件
#rm -rf * ###慎用！！！
nohup kraken2-build --standard --threads 24 --db ~/metagenome/kraken_standard &

https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats

rsync相关报错
gzip: nucl_gb.accession2taxid.gz: invalid compressed data--format violated

#rsync报错解决方案 https://blog.csdn.net/lanzhoushidabian/article/details/129599041 https://github.com/DerrickWood/kraken2/issues/465#issuecomment-870726096 https://blog.csdn.net/weixin_52602016/article/details/120689897 #报错-gzip: nucl_gb.accession2taxid.gz: invalid compressed data--format violated解决方案 https://github.com/DerrickWood/kraken2/issues/545 #下载kraken库压缩包 https://benlangmead.github.io/aws-indexes/k2