#安装质控软件
conda install -y fastqc
#安装宏基因组质控软件
conda install -y kneaddata
#安装失败,先装python3,再装bowtie2
conda install --channel conda-forge python=3.10
conda install -y bowtie2
https://github.com/biobakery/biobakery/wiki/kneaddata
#下载参考基因组--斑马鱼:GCA_000002035.4;新建文件夹host_db存放
#<https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000002035.6/>
#安装curl
sudo apt install curl
#下载
curl "<https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000002035.6/download?include_annotation_type=GENOME_FASTA&include_annotation_type=GENOME_GFF&include_annotation_type=RNA_FASTA&include_annotation_type=CDS_FASTA&include_annotation_type=PROT_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED>" -o Danio_rerio.GRCz11
#后台运行
curl "<https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000002035.6/download?include_annotation_type=GENOME_FASTA&include_annotation_type=GENOME_GFF&include_annotation_type=RNA_FASTA&include_annotation_type=CDS_FASTA&include_annotation_type=PROT_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED>" -o Danio_rerio.GRCz11 -s &
#检查命令运行是否完成和文件大小--未完成--重新运行
(base) zms@shpc-49506-instance-6QslimRj:~/host_db$ ps aux | grep curl
zms 9180 0.0 0.0 8908 720 pts/1 S+ 13:40 0:00 grep --color=auto curl
(base) zms@shpc-49506-instance-6QslimRj:~/host_db$ ls -lh
total 14M
-rw-rw-r-- 1 zms zms 14M Mar 11 23:00 Danio_rerio.GRCz11
#文件解压
unzip Danio_rerio.GRCz11
sudo apt install tree
#创建定制数据库--斑马鱼 <https://www.jianshu.com/p/cc306b201707>
bowtie2-build --threads 30 GCF_000002035.6_GRCz11_genomic.fna Danio_rerio
#查看进度
(meta) zms@shpc-49506-instance-6QslimRj:~$ ps aux|grep bowtie2
zms 9728 0.0 0.0 21148 12252 pts/2 S+ 21:39 0:00 python3 /home/zms/anaconda3/envs/meta/bin/bowtie2-build --threads 30 GCF_000002035.6_GRCz11_genomic.fna Danio_rerio
zms 9730 0.0 1.3 9533408 3496556 pts/2 Rl+ 21:39 291:34 /home/zms/anaconda3/envs/meta/bin/bowtie2-build-s --wrapper basic-0 --threads 30 GCF_000002035.6_GRCz11_genomic.fna Danio_rerio
zms 9854 0.0 0.0 9040 720 pts/1 S+ 22:17 0:00 grep --color=auto bowtie2
#失败--trimmomatic的jar无法找到的问题
##解决方案参考--<https://github.com/bioconda/bioconda-recipes/issues/18666>
(base) zms@shpc-49506-instance-6QslimRj:~$ conda activate meta
#查找trimmomatic
(meta) zms@shpc-49506-instance-6QslimRj:~$ which trimmomatic
/home/zms/anaconda3/envs/meta/bin/trimmomatic
#实际路径
(meta) zms@shpc-49506-instance-6QslimRj:~$ ls -lh /home/zms/anaconda3/envs/meta/bin/trimmomatic
lrwxrwxrwx 1 zms zms 39 Mar 11 22:35 /home/zms/anaconda3/envs/meta/bin/trimmomatic -> ../share/trimmomatic-0.39-2/trimmomatic
(meta) zms@shpc-49506-instance-6QslimRj:~$ java -jar ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/trimmomatic.jar
Usage:
PE [-version] [-threads <threads>] [-phred33|-phred64] [-trimlog <trimLogFile>] [-summary <statsSummaryFile>] [-quiet] [-validatePairs] [-basein <inputBase> | <inputFile1> <inputFile2>] [-baseout <outputBase> | <outputFile1P> <outputFile1U> <outputFile2P> <outputFile2U>] <trimmer1>...
or:
SE [-version] [-threads <threads>] [-phred33|-phred64] [-trimlog <trimLogFile>] [-summary <statsSummaryFile>] [-quiet] <inputFile> <outputFile> <trimmer1>...
or:
-version
#修改sh
kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz -db ${host_db} --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/
#!/bin/bash
if [ "$#" -eq "1" ];then
prj_path=$1
#elif [ "$#" -eq "1" ];then
# prj_path=$1
else
echo "Usage: `basename $0` {prj_path(full path need)} {host_db(def:homo}"
echo "You provided $# parameters,but 1 are required."
exit 0
fi
host_db=~/metagenome/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio
threads_num=20
source ~/anaconda3/etc/profile.d/conda.sh
conda activate meta
# 建立结果文件夹
cd ${prj_path}
mkdir -p 01_remove_host 02_kraken2 03_Assembly/01_megahit 03_Assembly/02_prodigal 04_Quant 05_Annotation/01_humann 05_Annotation/02_rgi 05_Annotation/03_dbcan 05_Annotation/04_vfdb
echo "#### Step 1: 去宿主并质控 ####"
echo "##### run kneaddata to remove host reads... #####"
cd ${prj_path}/MetaOriginFile
for i in *
do
if [ -d ${i} ];then
if [ ! -d ${prj_path}/01_remove_host/${i%%.*} ];then
{
echo "### Sample" ${i%%.*} "run kneaddata... ###"
mkdir ${prj_path}/01_remove_host/${i%%.*}
cd ${i}
kneaddata --input *1.clean.fq.gz --input *2.clean.fq.gz -db ${host_db} --reorder -t ${threads_num} -o ${prj_path}/01_remove_host/${i%%.*}/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/
rm ${prj_path}/01_remove_host/${i%%.*}/*{_bowtie2_,unmatched}*
cd ..
}
else
echo "### ${i%%.*} has run kneaddata ###"
fi
fi
done
echo "##### Step 1: kneaddata done #####"
#添加可执行权限
chmod a+x kneaddata.sh
(meta) zms@shpc-49506-instance-6QslimRj:~$ ./kneaddata.sh ~ &
[1] 10271
(meta) zms@shpc-49506-instance-6QslimRj:~$ #### Step 1: 去宿主并质控 ####
##### run kneaddata to remove host reads... #####
### Sample Y0715C run kneaddata... ###
Decompressing gzipped file ...
#查询进度
ps aux | grep knead
zms 10276 0.0 0.0 22436 14972 pts/4 R 23:51 0:25 /home/zms/anaconda3/envs/meta/bin/python3.10 /home/zms/anaconda3/envs/meta/bin/keaddata --input1 Y0715C_1.clean.fq.gz --input2 Y0715C_2.clean.fq.gz -db /home/zms/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o /home/zms/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic /home/zms/anaconda3/envs/meta/bin
zms 10284 0.0 0.0 9040 656 pts/4 S+ 23:52 0:00 grep --color=auto knead
##报错测试
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0715C$ kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz -db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic /home/zms/anaconda3/envs/meta/bin/trimmomatic
ERROR: Unable to list files in trimmomatic directory: /home/zms/anaconda3/envs/meta/bin/trimmomatic
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0715C$ kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz -db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/
###测试Y0715C
kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/
##测试Y0718C-nohup不挂断运行命令
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0718C$ nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0718C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
[1] 17487
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0718C$ jobs
[1]+ Running nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0718C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
#715失败,718成功--可能是命令运行一半终止--使用nohup命运重新运行
rm -rf Y0715C/
mkdir Y0715C Y0719C
#Y0715C
nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
#Y0719C
nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0719C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
###查询进度
(meta) zms@shpc-49506-instance-6QslimRj:~/MetaOriginFile/Y0719C$ jobs
[1]- Running nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ & (wd: ~/MetaOriginFile/Y0715C)
[2]+ Running nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 96 -o ~/01_remove_host/Y0719C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
###查看nohup执行命令的输出
cat MetaOriginFile/Y0715C/nohup.out
###失败---系统算力不足
nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 48 -o ~/01_remove_host/Y0715C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
nohup kneaddata --input1 *1.clean.fq.gz --input2 *2.clean.fq.gz --reference-db ~/host_db/ncbi_dataset/data/GCF_000002035.6/Danio_rerio --reorder -t 48 -o ~/01_remove_host/Y0719C/ --remove-intermediate-output --trimmomatic ~/anaconda3/envs/meta/share/trimmomatic-0.39-2/ &
nohup ./kneaddata.sh ~/metagenome &
#pair.fastq没有内容-怀疑kneaddata版本过高-降版本 https://forum.biobakery.org/t/all-paired-end-read-unmatched/2895/7
conda remvoe kneaddata
conda install kneaddata=0.10
https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown
conda install -y kraken2
#下载kraken标准数据库
kraken2-build --standard --threads 96 --db kraken_standard
#失败
(meta) zms@shpc-49506-instance-6QslimRj:~$ kraken2-build --standard --threads 96 --db kraken_standard
Downloading nucleotide gb accession to taxon map.../home/zms/anaconda3/envs/meta/share/kraken2-2.1.3-0/libexec/download_taxonomy.sh: line 27: rsync: command not found
#安装文件同步工具rsync
(meta) zms@shpc-49506-instance-6QslimRj:~$ sudo apt-get install rsync
#重新运行后
(meta) zms@shpc-49506-instance-6QslimRj:~$ ps aux | grep krak
zms 15633 0.0 0.0 9500 3304 pts/1 S 21:11 0:00 /bin/bash /home/zms/anaconda3/envs/meta/share/kraken2-2.1.3-0/libexec/standard_installation.sh
zms 15634 0.0 0.0 9632 3576 pts/1 S 21:11 0:00 /bin/bash /home/zms/anaconda3/envs/meta/share/kraken2-2.1.3-0/libexec/download_taxonomy.sh
zms 15639 0.0 0.0 8908 716 pts/1 S+ 21:13 0:00 grep --color=auto krak
#下载失败--重新运行--加入后台运行命令
#删除当前目录所有文件
#rm -rf * ###慎用!!!
nohup kraken2-build --standard --threads 24 --db ~/metagenome/kraken_standard &
https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats
#rsync报错解决方案 https://blog.csdn.net/lanzhoushidabian/article/details/129599041https://github.com/DerrickWood/kraken2/issues/465#issuecomment-870726096https://blog.csdn.net/weixin_52602016/article/details/120689897 #报错-gzip: nucl_gb.accession2taxid.gz: invalid compressed data--format violated解决方案 https://github.com/DerrickWood/kraken2/issues/545 #下载kraken库压缩包 https://benlangmead.github.io/aws-indexes/k2