TargetVet/VetHybPiper.sh at main · SethMusker/TargetVet · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
#!/bin/bash

# To do:
#  1. If running on cluster, automatically add local R library path to scripts.

usage () {
   echo -e "## Bash script to use TargetVet to detect paralogs from data for many samples that have been assembled by HybPiper.
   ## It also analyses and reports patterns of missingness, and plots all of these results in various ways.
   ## ARGUMENTS

   ##--- The following are REQUIRED.
   #  -V **Absolute** directory of VetTargets source code: without the trailing "/"
   #  -D Directory of HybPiper output (i.e. with separate folders for each sample and for genes therein): without the trailing "/". Can be '.' if current directory.
   #  -T Targets fasta (nucleotide)
   #  -S File listing sample names to process
   #  -G File listing gene names to process
   #  -O Output directory suffix. Output will be written to a folder in the HybPiper directory and be named TargetVet_results_<suffix> or TargetVet_results_deduped_<minidentity>_<suffix> if -d=TRUE.
   #  -M LOGICAL. Does the target fasta contain multiple copies per gene (TRUE or FALSE)? If TRUE, gene names in the target fasta must follow HybPiper convention, E.g. Artocarpus-gene001 and Morus-gene001 are the same gene. (Note: The gene names file (-G) must still have just the gene names, i.e. gene001, for example.)

   ##--- The following arguments are OPTIONAL.
   #  -B What type of BLAST to use? Options currently are blastn or tblastx. The latter is potentially more sensitive but also much more time-consuming. Default = blastn.
   #  -s LOGICAL. Whether to blast each target's contigs to it separately.
   #     Much slower (use -X TRUE to speed it up) but it should be used
         if you expect that some of your targets are very similar to each other
         (e.g. if you've been trying to assemble paralogs individually)
   #  -E What e-value to use as a cutoff for keeping matches in blast results. Default = 1e-6.
   #  -L Minimum length of blast matches to keep for analysis. Default = 150bp.
   #  -K Minimum percent identity (pident) of blast matches to Keep for analysis. Default = 70.
   #  -I LOGICAL. Do IntronStats? default = TRUE
   #  -F LOGICAL. Force overwrite of DetectParalogs.R output? default = FALSE
   #  -C LOGICAL. Do per-chromosome/scaffold stats? default = TRUE
   #  -d LOGICAL. Deduplicate contigs before running analysis? default = FALSE. (Must have BBMap's dedupe.sh in current path. See https://sourceforge.net/projects/bbmap/)
   #  -m If -d=TRUE, min % identity threshold to identify duplicates? Default = 97 (to allow for removal of alleles.)
   #  -i File listing 'ingroup' samples. This is useful if you have several outgroup taxa, which often have different paralogy patterns (especially if they were used to design the target set). A separate paralog detection analysis will be conducted using only the ingroup samples.
   #  -p Rooted phylogeny in Newick format. If provided, will make an additional paralogy heatmap using this tree instead of the cluster dendrogram. All tip labels need to match those in the samples file.
   #  -P LOGICAL. Do plots for each sample with VetTargets_genome.R? Time-consuming and not necessary. Default = FALSE.
   #  -X LOGICAL. Parallelise certain operations using GNU Parallel? Default = FALSE
   #  -t Number of threads to use for multithreaded operations? This includes GNU parallel if -X is TRUE and BLAST if -s is FALSE.
   "
}
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
	usage
	exit
fi


## set defaults
DO_INTRON=TRUE
DO_PER_CHROM=TRUE
DEDUPE=FALSE
MINIDENTITY=97
LENGTH=150
PIDENT=70
FORCE=FALSE
DO_PLOTS=FALSE
BLAST_TYPE=blastn
PHYLO=NULL
INGROUP=NULL
PARALLEL=FALSE
THREADS=1
EVALUE=1e-6
SEPARATE=FALSE
## parse args
while getopts V:D:T:S:G:L:K:I:C:d:m:O:M:F:i:p:P:B:X:t:E:s: option
do
case "${option}"
in

V) VETDIR=${OPTARG};;
D) DIR=${OPTARG};;
T) TARGETS=${OPTARG};;
S) SAMPLES=${OPTARG};;
G) GENES=${OPTARG};;
L) LENGTH=${OPTARG};;
K) PIDENT=${OPTARG};;
I) DO_INTRON=${OPTARG};;
C) DO_PER_CHROM=${OPTARG};;
d) DEDUPE=${OPTARG};;
m) MINIDENTITY=${OPTARG};;
O) OUTSUFFIX=${OPTARG};;
M) MULTI=${OPTARG};;
F) FORCE=${OPTARG};;
i) INGROUP=${OPTARG};;
p) PHYLO=${OPTARG};;
P) DO_PLOTS=${OPTARG};;
B) BLAST_TYPE=${OPTARG};;
E) EVALUE=${OPTARG};;
X) PARALLEL=${OPTARG};;
t) THREADS=${OPTARG};;
s) SEPARATE=${OPTARG};;

esac
done

echo "Starting VetHybPiper.sh."

# Check that things will run.

if [[ ${PARALLEL} == "TRUE" ]] && [[ $(which env_parallel.bash | wc -l) -eq 0 ]]; then
   echo "You have chosen to use parallelisation but it does not seem to be in your PATH."
   echo -e "You can add it to your path by running \nexport PATH='\$PATH:path/to/parallel'"
   echo "Exiting."
   exit
   #echo "Setting parallel to FALSE and proceeding."
   #PARALLEL=FALSE
fi

if [[ ${DEDUPE} == "TRUE" ]] && [[ $(which dedupe.sh | wc -l) -eq 0 ]]; then
   echo "You have chosen to use deduplication but dedupe.sh it does not seem to be in your PATH."
   echo -e "You can add it to your path by running \nexport PATH='\$PATH:path/to/BBMap-or-BBTools'"
   echo "Exiting."
   exit
   #echo "Setting parallel to FALSE and proceeding."
   #PARALLEL=FALSE
fi

if [[ ${PARALLEL} == "TRUE" ]] && [[ ${THREADS} -eq 1 ]]; then
   echo "You have chosen to use parallelisation but not specified more than 1 thread. Use the -t argument or specify -X FALSE. Exiting."
   exit
fi

# we need to source env_parallel.bash in order to make env_parallel a function
# this has to be done *inside* VetHybPiper.sh because it is invoked as a separate bash process to
# the one running on the compute node (assuming this is running on a cluster)
if [[ ${PARALLEL} == "TRUE" ]]; then
	. `which env_parallel.bash`
fi

# Sort out directories
if [[ ${DIR} == "." ]]; then
   DIR=$PWD
fi

if [[ ! -f ${GENES} ]] && [[ ! -f ${DIR}/${GENES} ]]; then
   echo "Couldn't find gene list ${GENES}. Exiting."
   exit
fi
if [[ ! -f ${SAMPLES} ]] && [[ ! -f ${DIR}/${SAMPLES} ]]; then
   echo "Couldn't find sample list ${SAMPLES}. Exiting."
   exit
fi
if [[ ! -f ${TARGETS} ]] && [[ ! -f ${DIR}/${TARGETS} ]]; then
   echo "Couldn't find target fasta ${TARGETS}. Exiting."
   exit
fi


# set the name of the output directory
if [[ ${DEDUPE} == "TRUE" ]]; then
  echo -e "Ah, so you have chosen deduplication. Summoning Bestus Bioinformaticus! (aka BBTools).\n Using MINIDENTITY=${MINIDENTITY}"
  OUTDIR=${DIR}/TargetVet_results_deduped_${MINIDENTITY}_${OUTSUFFIX}
else
  OUTDIR=${DIR}/TargetVet_results_${OUTSUFFIX}
fi

# (FOR NOW, DON'T UNCOMMENT THIS!) Set the name of the output directory (don't put it in the HybPiper directory)  -- This needs a lot of tweaking to make it work...
# if [[ ${DEDUPE} == "TRUE" ]]; then
   # echo -e "Ah, so you have chosen deduplication. Summoning Bestus Bioinformaticus! (aka BBTools).\n Using MINIDENTITY=${MINIDENTITY}"
   # OUTDIR=TargetVet_results_deduped_${MINIDENTITY}_${OUTSUFFIX}
# else
   # OUTDIR=TargetVet_results_${OUTSUFFIX}
# fi


echo "All output will be written to ${OUTDIR}"
mkdir -p ${OUTDIR}

# If full paths are provided and they're not pointing to files within ${DIR} subdirectories,
# change them to basename and copy into ${DIR} if they're not already there.
is_full_path () { echo $1 | grep '/' | wc -l; }
FP=$(is_full_path ${GENES})
if [[ $FP -eq 1 ]] && [[ ! -f ${DIR}/${GENES} ]]; then
   if [[ ! -f ${DIR}/$(basename ${GENES}) ]];then
      cp ${GENES} ${DIR}
   fi
   GENES=$(basename ${GENES})
fi
FP=$(is_full_path ${SAMPLES})
if [[ $FP -eq 1 ]] && [[ ! -f ${DIR}/${SAMPLES} ]]; then
   if [[ ! -f ${DIR}/$(basename ${SAMPLES}) ]];then
      cp ${SAMPLES} ${DIR}
   fi
   SAMPLES=$(basename ${SAMPLES})
fi
FP=$(is_full_path ${TARGETS})
if [[ $FP -eq 1 ]] && [[ ! -f ${DIR}/${TARGETS} ]]; then
   if [[ ! -f ${DIR}/$(basename ${TARGETS}) ]];then
      cp ${TARGETS} ${DIR}
   fi
   TARGETS=$(basename ${TARGETS})
fi

# Fix CRLF line endings if they exist, as they mess with the while loops
CRLF=$(file ${DIR}/${GENES} | grep 'CRLF' | wc -l)
if [[ $CRLF -eq 1 ]]; then
   echo "Gene list ${DIR}/${GENES} has CRLF line endings. Creating fixed file."
   sed 's/\r$//' ${DIR}/${GENES} > ${DIR}/${GENES}.CRLF-corrected.txt
   GENES=${GENES}.CRLF-corrected.txt
fi
CRLF=$(file ${DIR}/${SAMPLES} | grep 'CRLF' | wc -l)
if [[ $CRLF -eq 1 ]]; then
   echo "Sample list ${DIR}/${SAMPLES} has CRLF line endings. Creating fixed file."
   sed 's/\r$//' ${DIR}/${SAMPLES} > ${DIR}/${SAMPLES}.CRLF-corrected.txt
   SAMPLES=${SAMPLES}.CRLF-corrected.txt
fi


if [[ ${SEPARATE} == "FALSE" ]];then
   # 1. make TargetVet results folder and collate spades contigs per sample
   mkdir -p ${OUTDIR}/assemblies_collated
   while read i;do
      CONTIGS=${OUTDIR}/assemblies_collated/${i}_all_contigs.fasta
      if [[ -f ${CONTIGS} ]]; then
         echo "Collated contigs exist for ${i}. Skipping."
      else
         echo "Collating contigs for ${i}."

         if [[ ${PARALLEL} == "TRUE" ]]; then
            ## parallel cat
            cat_contigs(){
               if [[ -f ${DIR}/${i}/${1}/${1}_contigs.fasta.gz ]]; then
                     gzip -d ${DIR}/${i}/${1}/${1}_contigs.fasta.gz
               fi
               cat ${DIR}/${i}/${1}/${1}_contigs.fasta
            }
            # export -f cat_contigs
            echo "Collating contigs in parallel for sample ${i}."
            env_parallel --env cat_contigs \
               --env DIR \
               --env i \
               -j ${THREADS} "cat_contigs {}" :::: ${DIR}/${GENES} > ${CONTIGS}
         else
            while read g; do
               if [[ -f ${DIR}/${i}/${g}/${g}_contigs.fasta.gz ]]; then
                  gzip -d ${DIR}/${i}/${g}/${g}_contigs.fasta.gz
               fi
               cat ${DIR}/${i}/${g}/${g}_contigs.fasta
            done < ${DIR}/${GENES} > ${CONTIGS}
         fi

         if [[ ${DEDUPE} == "TRUE" ]] && [[ ${PARALLEL} == "FALSE" ]]; then
            dedupe.sh in=${CONTIGS} out=${OUTDIR}/assemblies_collated/${i}_all_contigs_deduped_${MINIDENTITY}.fasta threads=1 minidentity=${MINIDENTITY} 2> ${CONTIGS}.dedupe.log
         fi
      fi
   done < ${DIR}/${SAMPLES}

   if [[ ${DEDUPE} == "TRUE" ]] && [[ ${PARALLEL} == "TRUE" ]]; then
      echo "Running dedupe.sh on collated contigs in parallel across samples."
      env_parallel \
         --env OUTDIR \
         --env MINIDENTITY \
         -j ${THREADS} "dedupe.sh in=${OUTDIR}/assemblies_collated/{}_all_contigs.fasta out=${OUTDIR}/assemblies_collated/{}_all_contigs_deduped_${MINIDENTITY}.fasta threads=1 minidentity=${MINIDENTITY} 2> ${OUTDIR}/assemblies_collated/{}_all_contigs.fasta.dedupe.log" :::: ${DIR}/${SAMPLES}
   fi

   # 2. blast to TARGETS
   mkdir -p ${OUTDIR}/blast_out
   run_blast(){
      local BLASTOUT=${OUTDIR}/blast_out/${BLAST_TYPE}_`basename ${TARGETS}`_to_${1}_all_contigs.txt
      if [[ -f ${BLASTOUT} ]]; then
         echo "BLAST output exists for ${1}. Skipping. If you wish to redo the ${BLAST_TYPE} step, move the folder 'blast_out' to another directory, rename it to something else, or remove the relevant files in the blast_out directory with the prefix ${BLAST_TYPE}."
      else
         # if blast output doesn't exist, run blast
         if [[ ${DEDUPE} == "TRUE" ]]; then
            local SUBJECT=${OUTDIR}/assemblies_collated/${1}_all_contigs_deduped_${MINIDENTITY}.fasta
         else
            local SUBJECT=${OUTDIR}/assemblies_collated/${1}_all_contigs.fasta
         fi

         if [[ ${BLAST_TYPE} == "tblastx" ]]; then
            echo "Mapping contigs to targets using tblastx for ${1}."
            tblastx -query ${DIR}/${TARGETS} \
               -subject ${SUBJECT} \
               -out ${BLASTOUT} \
               -evalue ${EVALUE} \
               -outfmt "6 qseqid sseqid pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore"
         else
            echo "Mapping contigs to targets using blastn for ${1}."
            blastn -query ${DIR}/${TARGETS} \
               -subject ${SUBJECT} \
               -out ${BLASTOUT} \
               -evalue ${EVALUE} \
               -outfmt "6 qseqid sseqid pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore"
         fi
      fi
   }

   if [[ ${PARALLEL} == "FALSE" ]]; then
      while read i;do
         run_blast ${i}
      done < ${DIR}/${
	  }
   else
      echo "Running ${BLAST_TYPE} in parallel across samples."
      env_parallel --env DIR \
		 --env run_blast \
         --env OUTDIR \
         --env BLAST_TYPE \
         --env TARGETS \
         --env DEDUPE \
         --env MINIDENTITY \
         --env EVALUE \
         -j ${THREADS} "run_blast {}" :::: ${DIR}/${SAMPLES}
   fi
else
  ## If separate == "TRUE"
   mkdir -p ${OUTDIR}/blast_out
   echo "Running separate blast searches for each target and its contigs."
   TEMP=${OUTDIR}/temp
   mkdir -p ${TEMP}
   ## if SEPARATE=TRUE, we need to skip contig collation and run blast individually for each target
   # we need to generate a gene list of genes in the actual target file (because we might have multiple copies of each target)
   grep '^>' ${DIR}/${TARGETS} | sed 's/>//g' | sed 's/\r$//' > ${TEMP}/gene_names.txt
   echo "Writing targets to separate fastas in ${TEMP} if they don't exist."
   # pull out the targets from the target file
   # grep command from https://www.biostars.org/p/49820/
   while read T;do
      if [[ ${MULTI} == "TRUE" ]]; then
         T_ID=$(echo ${T} | cut -d'-' -f2)
      else
         T_ID=${T}
      fi
	  if [[ ! -f ${TEMP}/${T_ID}_target.fasta ]];then
		echo ">${T}" > ${TEMP}/${T_ID}_target.fasta
		grep -A 10000 -w ${T} ${DIR}/${TARGETS} | sed -n -e '1,/>/ {/>/ !{'p''}} >> ${TEMP}/${T_ID}_target.fasta
		# awk -v seq="${T}" -v RS='>' '$1 == seq {print RS $0}' ${DIR}/${TARGETS} > ${TEMP}/${T}_target.fasta
	  else
		echo ">${T}" >> ${TEMP}/${T_ID}_target.fasta
		grep -A 10000 -w ${T} ${DIR}/${TARGETS} | sed -n -e '1,/>/ {/>/ !{'p''}} >> ${TEMP}/${T_ID}_target.fasta
	  fi
   done < ${TEMP}/gene_names.txt
   # this function needs to be called for each gene, and must
   #  1. make a temporary fasta containing only the gene's sequence in the target file
   #  2. blast the contigs assembled for that gene to the temporary fasta
   #     and write each result to a separate temporary output file, (these will be concatenated later)
   run_blast_separately(){
      # local CURRENT_GENE_CONTIGS=${DIR}/${i}/${1}/${1}_contigs.fasta
      if [[ -f ${DIR}/${i}/${1}/${1}_contigs.fasta ]];then
         if [[ ! -f ${TEMP}/${BLAST_TYPE}_${i}_to_${1}.txt ]]; then
            if [[ ${BLAST_TYPE} == "tblastx" ]]; then
               tblastx -query ${TEMP}/${1}_target.fasta \
                  -subject ${DIR}/${i}/${1}/${1}_contigs.fasta \
                  -out ${TEMP}/${BLAST_TYPE}_${i}_to_${1}.txt \
                  -evalue ${EVALUE} \
                  -outfmt "6 qseqid sseqid pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore"
            else
               blastn -query ${TEMP}/${1}_target.fasta \
                  -subject ${DIR}/${i}/${1}/${1}_contigs.fasta \
                  -out ${TEMP}/${BLAST_TYPE}_${i}_to_${1}.txt \
                  -evalue ${EVALUE} \
                  -outfmt "6 qseqid sseqid pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore"
            fi
         fi
      fi
   }
   # now we loop over samples and
   # 1. blast targets to their contigs separately either as a loop or in parallel, then
   # 2. concatenate the results into one blast file to pass to VetTargets_genome.R
   while read i;do
    if [[ -f ${OUTDIR}/blast_out/${BLAST_TYPE}_`basename ${TARGETS}`_to_${i}_all_contigs.txt ]];then
	  echo "BLAST output exists for ${i}. Skipping. If you wish to redo the ${BLAST_TYPE} step, move the folder 'blast_out' to another directory, rename it to something else, or remove the relevant files in the blast_out directory with the prefix ${BLAST_TYPE}."
    else
      echo "Mapping contigs to targets using ${BLAST_TYPE} for ${i}, separately for each target's contigs."
      if [[ ${PARALLEL} == "FALSE" ]]; then
         while read g; do
            run_blast_separately ${g}
         done < ${DIR}/${GENES}
      else
      echo "Doing this in parallel..."
         env_parallel --env run_blast_separately \
            --env OUTDIR \
            --env BLAST_TYPE \
            --env TARGETS \
            --env TEMP \
            --env DIR \
            --env EVALUE \
            --env i \
            -j ${THREADS} "run_blast_separately {}" :::: ${DIR}/${GENES}
      fi
      # concatenate results
      cat ${TEMP}/${BLAST_TYPE}_${i}_to_*.txt \
         > ${OUTDIR}/blast_out/${BLAST_TYPE}_`basename ${TARGETS}`_to_${i}_all_contigs.txt
    fi
   done < ${DIR}/${SAMPLES}
# rm -rf ${TEMP}
fi

# 3. run VetTargets_genome.R
mkdir -p ${OUTDIR}/VetTargets_genome_output
cd ${OUTDIR}/VetTargets_genome_output
run_VetTargets(){
      if [[ -f ${OUTDIR}/VetTargets_genome_output/${1}_CoverageStats_AcrossChromosomes.txt ]]; then
         echo "VetTargets_genome coverage stats output exists for ${1}. Skipping."
      else
         echo "Running VetTargets_genome on ${1}."
         local BL=${OUTDIR}/blast_out/${BLAST_TYPE}_`basename ${TARGETS}`_to_${1}_all_contigs.txt
         local BLH=${OUTDIR}/blast_out/${BLAST_TYPE}_`basename ${TARGETS}`_to_${1}_all_contigs.withHeader.txt
         echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqlen\tqstart\tqend\tslen\tsstart\tsend\tevalue\tbitscore" | \
            cat - ${BL} > ${BLH}

         Rscript ${VETDIR}/VetTargets_genome.R --blast_file ${BLH} \
            --output_prefix ${1} \
            --min_fragment_length ${LENGTH} \
            --min_pident ${PIDENT} \
            --max_intron_length 10000 \
            --max_intron_percent 1 \
            --min_display_intron 10 \
            --doPlots ${DO_PLOTS} \
            --doIntronStats ${DO_INTRON} \
            --doCovPerChrom ${DO_PER_CHROM} \
            --multicopyTarget ${MULTI} \
            --genelist ${DIR}/${GENES} \
            --blast_type ${BLAST_TYPE}
      fi
   }
if [[ ${PARALLEL} == "FALSE" ]]; then
   while read i;do
      run_VetTargets ${i}
   done < ${DIR}/${SAMPLES}
else
   echo "Running VetTargets_genome.R in parallel across samples."
   env_parallel --env run_VetTargets \
      --env OUTDIR \
      --env BLAST_TYPE \
      --env TARGETS \
      --env VETDIR \
      --env LENGTH \
      --env PIDENT \
      --env DO_PLOTS \
      --env DO_INTRON \
      --env DO_PER_CHROM \
      --env MULTI \
      --env DIR \
      --env GENES \
      -j ${THREADS} "run_VetTargets {}" :::: ${DIR}/${SAMPLES}
fi

# 4. collate CoverageStats_AcrossChromosomes.txt
# AND
# 5. detect paralogs using paralogy Rate and step-function/nplr analysis
# AND
# 6. Output genelists for paralogs and single-copy

if [[ ${MULTI} == "TRUE" ]]; then
   # get prefixes so as to check for pre-existing covstats and run DetectParalogs.R on each separately
   i=`head -n1 ${DIR}/${SAMPLES}`
   BL=${OUTDIR}/blast_out/${BLAST_TYPE}_`basename ${TARGETS}`_to_${i}_all_contigs.txt
   cut -d'-' -f1 ${BL} | sort | uniq > targetsourcenames.txt
fi

if [[ -f ${INGROUP} ]];then
   echo "Ingroup file found."
   ING="-i ${INGROUP}"
else
   echo "Ingroup file not found or not provided."
   ING=""
fi

if [[ -f ${PHYLO} ]];then
   echo "Phylogeny file found."
   PHY="-p ${PHYLO}"
else
   echo "Phylogeny file not found or not provided."
   PHY=""
fi


echo "Detecting paralogs..."
if [[ ${MULTI} == "TRUE" ]]; then
   while read TSN; do
      echo "Working on $TSN..."
      Rscript ${VETDIR}/DetectParalogs.R -s ${DIR}/${SAMPLES} \
         -d ${OUTDIR}/VetTargets_genome_output/${TSN} \
         -o ${OUTDIR}/DetectParalogs_output/${TSN} \
         -f ${FORCE} \
         ${ING} \
         ${PHY}
      echo "Finished $TSN."
  done < targetsourcenames.txt
else
   Rscript ${VETDIR}/DetectParalogs.R \
      -s ${DIR}/${SAMPLES} \
      -d ${OUTDIR}/VetTargets_genome_output \
      -o ${OUTDIR}/DetectParalogs_output \
      -f ${FORCE} \
      ${ING} \
      ${PHY}
fi

echo "VetHybPiper.sh finished!"