#!/bin/sh
# hypothetical.sh
###############
# Per-isolate hypothetical-protein profile.
#
# For every isolate named in input_files/isolate_list this script
#   1. runs fasta_extraction.pl on the isolate's .faa proteome to pull out the
#      records matching 'hypothetical', then fasta_length.pl on that result,
#      and sorts the lengths numerically (all three files go to result_files/);
#   2. prints the number of '>' header lines, the number of lines containing
#      "hypothetical", and the integer percentage of the latter;
#   3. prints the first and last entries of the sorted length file and the
#      counts of entries <= 50 and <= 100;
#   4. prints the CRISPR / spacers / tRNAs / Contig lines of the isolate's log.
#
# The banner lines are matched verbatim by hypothetical_profile_analysis.sh,
# so their text must stay exactly as it is.
#
# Environment:
#   DENOVO_ANALYSIS_DIR  analysis root; defaults to /home/pseema/denovo_analysis
#
# Written for plain sh because the wrapper starts this script with `sh`.

denovo_root=${DENOVO_ANALYSIS_DIR:-/home/pseema/denovo_analysis}
result_dir="$denovo_root/result_files"

# count_at_most LIMIT FILE
# Prints how many lines of FILE have a numeric first field that is <= LIMIT.
count_at_most() {
  awk -v limit="$1" '$1 == $1 + 0 && $1 <= limit' "$2" | wc -l
}

# `|| [ -n ... ]` keeps a final line that has no trailing newline.
while read -r isolate || [ -n "$isolate" ]; do
  # A blank line would otherwise be processed as an isolate called "".
  [ -n "$isolate" ] || continue

  echo "**********Starting $isolate************* "

  proteome="$denovo_root/protein_fasta_files/$isolate.faa"
  annotation_log="$denovo_root/log_files/$isolate.log"
  hypo_fasta="$result_dir/$isolate.hypothetical"
  hypo_lengths="$result_dir/length_$isolate.hypothetical"
  hypo_sorted="$result_dir/sorted_length_$isolate.hypothetical"

  # Extract the hypothetical-protein records, measure them, sort the lengths.
  perl "$denovo_root/fasta_extraction.pl" -m 'hypothetical' "$proteome" > "$hypo_fasta"
  perl "$denovo_root/fasta_length.pl" < "$hypo_fasta" > "$hypo_lengths"
  sort -n "$hypo_lengths" > "$hypo_sorted"

  # Each count is taken once and reused for display, file output and maths.
  # grep -c prints nothing when the file is unreadable, hence the fallback.
  echo "####Total number of genes in isolate is####"
  total_genes=$(grep -c ">" "$proteome")
  [ -n "$total_genes" ] || total_genes=0
  echo "$total_genes"
  echo "$total_genes" > "$result_dir/$isolate.total_number"

  echo "####Total number of hypothetical protein sequence is####"
  hypo_count=$(grep -c "hypothetical" "$proteome")
  [ -n "$hypo_count" ] || hypo_count=0
  echo "$hypo_count"
  echo "$hypo_count" > "$result_dir/$isolate.only_hypothetical"

  # Guard the division: a zero divisor is an arithmetic error that would
  # abort the loop and skip every remaining isolate.
  if [ "$total_genes" -gt 0 ]; then
    percent=$((100 * hypo_count / total_genes))
  else
    percent=0
  fi
  echo "####The percentage of hypothetical genes is#### "
  echo "$percent"

  echo "####Amino acid length of the smallest hypothetical protein is####"
  cut -f1 -d"," "$hypo_sorted" | head -n 1
  echo "####Amino acid length of the largest hypothetical protein is####"
  cut -f1 -d"," "$hypo_sorted" | tail -n 1

  echo "####The numbers of hypothetical proteins with length 50 or below are####"
  count_at_most 50 "$hypo_sorted"
  echo "####The numbers of hypothetical proteins with length 100 or below are#### "
  short_count=$(count_at_most 100 "$hypo_sorted")
  echo "$short_count"

  if [ "$hypo_count" -gt 0 ]; then
    percentage=$((100 * short_count / hypo_count))
  else
    percentage=0
  fi
  echo "###The percentage of hypothetical proteins with length less than 100aa is###"
  echo "$percentage"

  echo "#################Number of CRISPRs in the $isolate#################"
  grep "CRISPR" "$annotation_log"
  echo "#################Number of spacers in the $isolate#################"
  # Kept as grep | wc -l so a number is printed even when the log is missing;
  # the analysis script reads the line right after this banner.
  grep "spacers" "$annotation_log" | wc -l
  echo "#################Number of tRNAs in the $isolate###################"
  grep "tRNAs" "$annotation_log"
  echo "#################Number of Contig in the $isolate###################"
  grep "Contig" "$annotation_log"
  echo "*************$isolate genome done**************"
done < "$denovo_root/input_files/isolate_list"
# -----------------------------------
#!/bin/sh
# hypothetical_profile_analysis.sh
# It takes hypothetical_profile as input file.
#
# For each of five per-isolate figures this part of the script
#   * copies the profile line that follows the figure's banner into
#     result_files/<stem>,
#   * pastes the first field of those lines next to input_files/isolate_list
#     (result_files/isolate_list_<stem>) and prints that table,
#   * prints the table ordered by the figure, and
#   * prints the smallest and largest figure, each prefixed by the number of
#     isolates sharing it (uniq -c), from result_files/isolate_list_<stem>_sorted.
#
# All ordering is numeric. A plain text sort puts 1000 before 999, which made
# both the "sorted" listing and the lower/upper limits wrong whenever the
# figures differed in digit count.
#
# Environment:
#   DENOVO_ANALYSIS_DIR  analysis root; defaults to /home/pseema/denovo_analysis

analysis_root=${DENOVO_ANALYSIS_DIR:-/home/pseema/denovo_analysis}
profile_file="$analysis_root/hypothetical_profile"
isolate_names="$analysis_root/input_files/isolate_list"
out_dir="$analysis_root/result_files"
tab_char=$(printf '\t')

# summarise_range TITLE BANNER STEM SORTED_TITLE LOWER_LABEL UPPER_LABEL
#   TITLE         heading printed before the table
#   BANNER        literal text of the profile banner; the line after it is kept
#   STEM          base name of the files written under result_files/
#   SORTED_TITLE  heading printed before the ordered table
#   LOWER_LABEL / UPPER_LABEL  headings for the smallest / largest figure
summarise_range() {
  echo "$1"
  # The "print pending line" rule comes first so a banner never prints itself.
  awk -v banner="$2" 'grab { print; grab = 0 } index($0, banner) { grab = 1 }' \
    "$profile_file" > "$out_dir/$3"
  awk '{print $1}' < "$out_dir/$3" \
    | paste "$isolate_names" - > "$out_dir/isolate_list_$3"
  cat "$out_dir/isolate_list_$3"

  echo "$4"
  # paste joins with a tab, so column 2 is the figure; order it as a number.
  sort -t "$tab_char" -k2,2n "$out_dir/isolate_list_$3"

  awk -F '\t' '{print $2}' "$out_dir/isolate_list_$3" \
    | sort -n | uniq -c > "$out_dir/isolate_list_${3}_sorted"
  echo "$5"
  head -n 1 "$out_dir/isolate_list_${3}_sorted"
  echo "$6"
  tail -n 1 "$out_dir/isolate_list_${3}_sorted"
}

summarise_range \
  "*********Range of total number of genes in the isolates********" \
  "Total number of genes in isolate is" \
  total_genes \
  "******Sorted as per gene numbers*******" \
  "Lower limit of genes :" \
  "Upperlimit of genes :"

summarise_range \
  "*********Range of total number of hypothetical proteins in the isolates*********" \
  "Total number of hypothetical protein sequence is" \
  hypothetical_genes \
  "******Sorted as per hypothetical protein numbers********" \
  "Lower limit of hypothetical proteins :" \
  "Upper limit of hypothetical proteins :"

summarise_range \
  "*********Range of percentage of hypothetical proteins in the isolates*********" \
  "The percentage of hypothetical genes is" \
  percentage_hypothetical_genes \
  "******Sorted as per hypothetical protein percentage*********" \
  "Lower limit of hypothetical protein percentage :" \
  "Upper limit of hypothetical protein percentage :"

summarise_range \
  "*********Range of smallest hypothetical proteins in the isolates*********" \
  "Amino acid length of the smallest hypothetical protein is" \
  smallest_hypothetical_protein \
  "******Sorted as per smallest hypothetical protein order" \
  "Lower limit of smallest protein:" \
  "Upper limit of smallest protein :"

summarise_range \
  "*********Range of largest hypothetical proteins in the isolates*********" \
  "Amino acid length of the largest hypothetical protein is" \
  largest_hypothetical_protein \
  "******Sorted as per largest hypothetical protein order" \
  "Lower limit of largest protein:" \
  "Upper limit of largest protein :"
# Per-isolate tables that are only listed (no ordering, no min/max summary).
# NOTE(review): the first two headings say "less than 50aa/100aa" while the
# profile banners they read say "50 or below"/"100 or below".
profile_dir=/home/pseema/denovo_analysis

# tabulate AWK_PROGRAM COLUMN STEM
# Runs AWK_PROGRAM over the profile and stores the selected lines in
# result_files/STEM; keeps field number COLUMN of each stored line, pastes it
# next to the isolate list into result_files/isolate_list_STEM, and prints
# that table.
tabulate() {
  awk "$1" "$profile_dir/hypothetical_profile" > "$profile_dir/result_files/$3"
  awk -v col="$2" '{print $col}' < "$profile_dir/result_files/$3" \
    | paste "$profile_dir/input_files/isolate_list" - \
    > "$profile_dir/result_files/isolate_list_$3"
  cat "$profile_dir/result_files/isolate_list_$3"
}

echo "*********Range of hypothetical proteins with length less than 50aa in the isolates*********"
# Line following the banner 'The numbers of hypothetical proteins with length 50 or below are'
tabulate 'f{print;f=0} /The numbers of hypothetical proteins with length 50 or below are/{f=1}' 1 length_less_than_50
#################
echo "*********Range of hypothetical proteins with length less than 100aa in the isolates*********"
# Line following the banner 'The numbers of hypothetical proteins with length 100 or below are'
tabulate 'f{print;f=0} /The numbers of hypothetical proteins with length 100 or below are/{f=1}' 1 length_less_than_100
#################
echo "*********Range of percentage of hypothetical proteins with length less than 100aa in the isolates*********"
# Line following the banner 'The percentage of hypothetical proteins with length less than 100aa is'
tabulate 'f{print;f=0} /The percentage of hypothetical proteins with length less than 100aa is/{f=1}' 1 percentage_length_less_than_100
#################
echo "*********Range of number of CRISPRs in the isolates*********"
# Lines that contain both 'Found' and 'CRISPRs'; their third field is tabulated
tabulate '/Found/ && /CRISPRs/' 3 CRISPR_profile
#################
echo "*********Range of number of spacers in the isolates*********"
# Line following the banner 'Number of spacers in the'
tabulate 'f{print;f=0} /Number of spacers in the/{f=1}' 1 spacer_profile
#################
echo "*********Range of tRNAs in the isolates*********"
# Line following 'Predicting tRNAs and tmRNAs'; its third field is tabulated
tabulate 'f{print;f=0} /Predicting tRNAs and tmRNAs/{f=1}' 3 tRNA_profile
#################
# --------------------------
#!/usr/bin/env bash
# hypothetical_analysis_wrapper.sh
#
# Wrapper for all the hypothetical-analysis scripts. It analyzes the
# hypothetical protein distribution among the 51 de novo + 2 reference strains.
# Input files and paths are described in README.txt.
#
# Every step is started with `sh` from the current directory. Unless noted,
# a step's stdout and stderr are shown on the terminal and also saved to the
# capture file named next to it.
# `2>&1 |` is used rather than the bash-only `|&`, so the wrapper also runs
# under plain sh, which is how the other scripts are launched.

# run_and_capture SCRIPT CAPTURE_FILE
# Announces SCRIPT, runs it with sh, and tees stdout+stderr into CAPTURE_FILE.
run_and_capture() {
  echo "$1 is running.........."
  sh "$1" 2>&1 | tee "$2"
}

######### hypothetical_profile.sh #########
# Extracts the key figures per genome: number of hypothetical proteins, their
# sizes, their share of the genome, and other genome characteristics.
run_and_capture hypothetical_profile.sh hypothetical_profile
# NOTE(review): tee has already shown this output; the file is printed a
# second time here, as the original wrapper did.
cat hypothetical_profile

######### hypothetical_profile_analysis.sh #########
# Input: hypothetical_profile. Analyzes the hypothetical protein pattern
# across isolates.
run_and_capture hypothetical_profile_analysis.sh hypothetical_pattern

######### fasta_manipulation.sh #########
# Input: isolate.faa. Analyzes smallest and largest functional protein (length
# and name), ESAT, ESX, transposase, efflux, FabG, antitoxin, ribonuclease,
# PE, PPE and universal stress protein patterns across isolates.
run_and_capture fasta_manipulation.sh fasta_manipulation_data_file

######### fasta_manipulation_analysis.sh #########
# Input: fasta_manipulation_data_file. Builds a table of smallest/largest
# functional protein length, country collected from, number of hypothetical
# proteins, and the ESAT, ESX, transposase, efflux, FabG, antitoxin,
# ribonuclease, PE, PPE and universal stress protein patterns per isolate.
# Output file: final_fasta_manipulation_analysis_file (generated in the
# hypothetical_analysis directory). This step is not teed to a capture file.
echo "fasta_manipulation_analysis.sh is running.........."
sh fasta_manipulation_analysis.sh

######### coiled.sh #########
# Input: isolate_hypothetical and isolate_list. A while loop looks for the
# M. tuberculosis hypothetical-gene coiled-coil motifs (24 hypothetical genes
# with motifs derived by SMART) in the de novo isolate hypothetical genes.
# Output files: isolate.coiled_motif_found_gene
run_and_capture coiled.sh hypothetical_coiled_motif_found

######### coiled_analysis.sh #########
# Input: the files generated by coiled.sh, and isolate_list. Uses two while
# loops; data is output to the monitor. Can be run on lineages by changing
# isolate_list to lineage_isolates.
run_and_capture coiled_analysis.sh hypothetical_coiled_motif_pattern_locations

######### membrane.sh #########
# Input: isolate_hypothetical and isolate_list. A while loop looks for the
# M. tuberculosis hypothetical-gene membrane motifs (68 hypothetical genes
# with motifs derived by SMART) in the de novo isolate hypothetical genes.
# Output files: isolate.helix_motifs_obtained_gene
run_and_capture membrane.sh hypothetical_membrane_motif_found

######### membrane_analysis.sh #########
# Input: the files generated by running membrane.sh, and isolate_list. Uses
# two while loops; data is output to the monitor. Can be run on lineages by
# changing isolate_list to lineage_isolates.
run_and_capture membrane_analysis.sh hypothetical_membrane_motif_pattern_locations
# hypothetical.sh
###############
# Per-isolate hypothetical-protein profile.
#
# For every isolate named in input_files/isolate_list this script
#   1. runs fasta_extraction.pl on the isolate's .faa proteome to pull out the
#      records matching 'hypothetical', then fasta_length.pl on that result,
#      and sorts the lengths numerically (all three files go to result_files/);
#   2. prints the number of '>' header lines, the number of lines containing
#      "hypothetical", and the integer percentage of the latter;
#   3. prints the first and last entries of the sorted length file and the
#      counts of entries <= 50 and <= 100;
#   4. prints the CRISPR / spacers / tRNAs / Contig lines of the isolate's log.
#
# The banner lines are matched verbatim by hypothetical_profile_analysis.sh,
# so their text must stay exactly as it is.
#
# Environment:
#   DENOVO_ANALYSIS_DIR  analysis root; defaults to /home/pseema/denovo_analysis
#
# Written for plain sh because the wrapper starts this script with `sh`.

denovo_root=${DENOVO_ANALYSIS_DIR:-/home/pseema/denovo_analysis}
result_dir="$denovo_root/result_files"

# count_at_most LIMIT FILE
# Prints how many lines of FILE have a numeric first field that is <= LIMIT.
count_at_most() {
  awk -v limit="$1" '$1 == $1 + 0 && $1 <= limit' "$2" | wc -l
}

# `|| [ -n ... ]` keeps a final line that has no trailing newline.
while read -r isolate || [ -n "$isolate" ]; do
  # A blank line would otherwise be processed as an isolate called "".
  [ -n "$isolate" ] || continue

  echo "**********Starting $isolate************* "

  proteome="$denovo_root/protein_fasta_files/$isolate.faa"
  annotation_log="$denovo_root/log_files/$isolate.log"
  hypo_fasta="$result_dir/$isolate.hypothetical"
  hypo_lengths="$result_dir/length_$isolate.hypothetical"
  hypo_sorted="$result_dir/sorted_length_$isolate.hypothetical"

  # Extract the hypothetical-protein records, measure them, sort the lengths.
  perl "$denovo_root/fasta_extraction.pl" -m 'hypothetical' "$proteome" > "$hypo_fasta"
  perl "$denovo_root/fasta_length.pl" < "$hypo_fasta" > "$hypo_lengths"
  sort -n "$hypo_lengths" > "$hypo_sorted"

  # Each count is taken once and reused for display, file output and maths.
  # grep -c prints nothing when the file is unreadable, hence the fallback.
  echo "####Total number of genes in isolate is####"
  total_genes=$(grep -c ">" "$proteome")
  [ -n "$total_genes" ] || total_genes=0
  echo "$total_genes"
  echo "$total_genes" > "$result_dir/$isolate.total_number"

  echo "####Total number of hypothetical protein sequence is####"
  hypo_count=$(grep -c "hypothetical" "$proteome")
  [ -n "$hypo_count" ] || hypo_count=0
  echo "$hypo_count"
  echo "$hypo_count" > "$result_dir/$isolate.only_hypothetical"

  # Guard the division: a zero divisor is an arithmetic error that would
  # abort the loop and skip every remaining isolate.
  if [ "$total_genes" -gt 0 ]; then
    percent=$((100 * hypo_count / total_genes))
  else
    percent=0
  fi
  echo "####The percentage of hypothetical genes is#### "
  echo "$percent"

  echo "####Amino acid length of the smallest hypothetical protein is####"
  cut -f1 -d"," "$hypo_sorted" | head -n 1
  echo "####Amino acid length of the largest hypothetical protein is####"
  cut -f1 -d"," "$hypo_sorted" | tail -n 1

  echo "####The numbers of hypothetical proteins with length 50 or below are####"
  count_at_most 50 "$hypo_sorted"
  echo "####The numbers of hypothetical proteins with length 100 or below are#### "
  short_count=$(count_at_most 100 "$hypo_sorted")
  echo "$short_count"

  if [ "$hypo_count" -gt 0 ]; then
    percentage=$((100 * short_count / hypo_count))
  else
    percentage=0
  fi
  echo "###The percentage of hypothetical proteins with length less than 100aa is###"
  echo "$percentage"

  echo "#################Number of CRISPRs in the $isolate#################"
  grep "CRISPR" "$annotation_log"
  echo "#################Number of spacers in the $isolate#################"
  # Kept as grep | wc -l so a number is printed even when the log is missing;
  # the analysis script reads the line right after this banner.
  grep "spacers" "$annotation_log" | wc -l
  echo "#################Number of tRNAs in the $isolate###################"
  grep "tRNAs" "$annotation_log"
  echo "#################Number of Contig in the $isolate###################"
  grep "Contig" "$annotation_log"
  echo "*************$isolate genome done**************"
done < "$denovo_root/input_files/isolate_list"
# -----------------------------------
#!/bin/sh
# hypothetical_profile_analysis.sh
# It takes hypothetical_profile as input file.
#
# For each of five per-isolate figures this part of the script
#   * copies the profile line that follows the figure's banner into
#     result_files/<stem>,
#   * pastes the first field of those lines next to input_files/isolate_list
#     (result_files/isolate_list_<stem>) and prints that table,
#   * prints the table ordered by the figure, and
#   * prints the smallest and largest figure, each prefixed by the number of
#     isolates sharing it (uniq -c), from result_files/isolate_list_<stem>_sorted.
#
# All ordering is numeric. A plain text sort puts 1000 before 999, which made
# both the "sorted" listing and the lower/upper limits wrong whenever the
# figures differed in digit count.
#
# Environment:
#   DENOVO_ANALYSIS_DIR  analysis root; defaults to /home/pseema/denovo_analysis

analysis_root=${DENOVO_ANALYSIS_DIR:-/home/pseema/denovo_analysis}
profile_file="$analysis_root/hypothetical_profile"
isolate_names="$analysis_root/input_files/isolate_list"
out_dir="$analysis_root/result_files"
tab_char=$(printf '\t')

# summarise_range TITLE BANNER STEM SORTED_TITLE LOWER_LABEL UPPER_LABEL
#   TITLE         heading printed before the table
#   BANNER        literal text of the profile banner; the line after it is kept
#   STEM          base name of the files written under result_files/
#   SORTED_TITLE  heading printed before the ordered table
#   LOWER_LABEL / UPPER_LABEL  headings for the smallest / largest figure
summarise_range() {
  echo "$1"
  # The "print pending line" rule comes first so a banner never prints itself.
  awk -v banner="$2" 'grab { print; grab = 0 } index($0, banner) { grab = 1 }' \
    "$profile_file" > "$out_dir/$3"
  awk '{print $1}' < "$out_dir/$3" \
    | paste "$isolate_names" - > "$out_dir/isolate_list_$3"
  cat "$out_dir/isolate_list_$3"

  echo "$4"
  # paste joins with a tab, so column 2 is the figure; order it as a number.
  sort -t "$tab_char" -k2,2n "$out_dir/isolate_list_$3"

  awk -F '\t' '{print $2}' "$out_dir/isolate_list_$3" \
    | sort -n | uniq -c > "$out_dir/isolate_list_${3}_sorted"
  echo "$5"
  head -n 1 "$out_dir/isolate_list_${3}_sorted"
  echo "$6"
  tail -n 1 "$out_dir/isolate_list_${3}_sorted"
}

summarise_range \
  "*********Range of total number of genes in the isolates********" \
  "Total number of genes in isolate is" \
  total_genes \
  "******Sorted as per gene numbers*******" \
  "Lower limit of genes :" \
  "Upperlimit of genes :"

summarise_range \
  "*********Range of total number of hypothetical proteins in the isolates*********" \
  "Total number of hypothetical protein sequence is" \
  hypothetical_genes \
  "******Sorted as per hypothetical protein numbers********" \
  "Lower limit of hypothetical proteins :" \
  "Upper limit of hypothetical proteins :"

summarise_range \
  "*********Range of percentage of hypothetical proteins in the isolates*********" \
  "The percentage of hypothetical genes is" \
  percentage_hypothetical_genes \
  "******Sorted as per hypothetical protein percentage*********" \
  "Lower limit of hypothetical protein percentage :" \
  "Upper limit of hypothetical protein percentage :"

summarise_range \
  "*********Range of smallest hypothetical proteins in the isolates*********" \
  "Amino acid length of the smallest hypothetical protein is" \
  smallest_hypothetical_protein \
  "******Sorted as per smallest hypothetical protein order" \
  "Lower limit of smallest protein:" \
  "Upper limit of smallest protein :"

summarise_range \
  "*********Range of largest hypothetical proteins in the isolates*********" \
  "Amino acid length of the largest hypothetical protein is" \
  largest_hypothetical_protein \
  "******Sorted as per largest hypothetical protein order" \
  "Lower limit of largest protein:" \
  "Upper limit of largest protein :"
echo "*********Range of hypothetical proteins with length less than 50aa in the isolates*********"
#This code prints the next line following the pattern 'The numbers of hypothetical proteins with length 50 or below are' in the input file
awk 'f{print;f=0} /The numbers of hypothetical proteins with length 50 or below are/{f=1}' /home/pseema/denovo_analysis/hypothetical_profile > /home/pseema/denovo_analysis/result_files/length_less_than_50
awk '{print $1}' < /home/pseema/denovo_analysis/result_files/length_less_than_50 | paste /home/pseema/denovo_analysis/input_files/isolate_list - > /home/pseema/denovo_analysis/result_files/isolate_list_length_less_than_50
cat /home/pseema/denovo_analysis/result_files/isolate_list_length_less_than_50
#################
printf '%s\n' "*********Range of hypothetical proteins with length less than 100aa in the isolates*********"
# Files used by this section.
profile_report=/home/pseema/denovo_analysis/hypothetical_profile
isolate_roster=/home/pseema/denovo_analysis/input_files/isolate_list
le100_values=/home/pseema/denovo_analysis/result_files/length_less_than_100
le100_table=/home/pseema/denovo_analysis/result_files/isolate_list_length_less_than_100
# Save the line that comes right after each
# 'The numbers of hypothetical proteins with length 100 or below are' line of the profile.
awk 'grab { print; grab = 0 } /The numbers of hypothetical proteins with length 100 or below are/ { grab = 1 }' "$profile_report" > "$le100_values"
# Put the first field of every saved line next to the matching line of the isolate list.
awk '{ print $1 }' < "$le100_values" | paste "$isolate_roster" - > "$le100_table"
cat "$le100_table"
#################
printf '%s\n' "*********Range of percentage of hypothetical proteins with length less than 100aa in the isolates*********"
# Files used by this section.
profile_report=/home/pseema/denovo_analysis/hypothetical_profile
isolate_roster=/home/pseema/denovo_analysis/input_files/isolate_list
pct100_values=/home/pseema/denovo_analysis/result_files/percentage_length_less_than_100
pct100_table=/home/pseema/denovo_analysis/result_files/isolate_list_percentage_length_less_than_100
# Save the line that comes right after each
# 'The percentage of hypothetical proteins with length less than 100aa is' line of the profile.
awk 'grab { print; grab = 0 } /The percentage of hypothetical proteins with length less than 100aa is/ { grab = 1 }' "$profile_report" > "$pct100_values"
# Put the first field of every saved line next to the matching line of the isolate list.
awk '{ print $1 }' < "$pct100_values" | paste "$isolate_roster" - > "$pct100_table"
cat "$pct100_table"
#################
printf '%s\n' "*********Range of number of CRISPRs in the isolates*********"
# Files used by this section.
profile_report=/home/pseema/denovo_analysis/hypothetical_profile
isolate_roster=/home/pseema/denovo_analysis/input_files/isolate_list
crispr_lines=/home/pseema/denovo_analysis/result_files/CRISPR_profile
crispr_table=/home/pseema/denovo_analysis/result_files/isolate_list_CRISPR_profile
# Save every profile line that contains both 'Found' and 'CRISPRs'.
awk '/Found/ && /CRISPRs/ { print }' "$profile_report" > "$crispr_lines"
# Put the third field of every saved line (presumably the CRISPR count - confirm
# against the profile format) next to the matching line of the isolate list.
awk '{ print $3 }' < "$crispr_lines" | paste "$isolate_roster" - > "$crispr_table"
cat "$crispr_table"
#################
printf '%s\n' "*********Range of number of spacers in the isolates*********"
# Files used by this section.
profile_report=/home/pseema/denovo_analysis/hypothetical_profile
isolate_roster=/home/pseema/denovo_analysis/input_files/isolate_list
spacer_lines=/home/pseema/denovo_analysis/result_files/spacer_profile
spacer_table=/home/pseema/denovo_analysis/result_files/isolate_list_spacer_profile
# Save the line that comes right after each 'Number of spacers in the' line of the profile.
awk 'grab { print; grab = 0 } /Number of spacers in the/ { grab = 1 }' "$profile_report" > "$spacer_lines"
# Put the first field of every saved line next to the matching line of the isolate list.
awk '{ print $1 }' < "$spacer_lines" | paste "$isolate_roster" - > "$spacer_table"
cat "$spacer_table"
#################
printf '%s\n' "*********Range of tRNAs in the isolates*********"
# Files used by this section.
profile_report=/home/pseema/denovo_analysis/hypothetical_profile
isolate_roster=/home/pseema/denovo_analysis/input_files/isolate_list
trna_lines=/home/pseema/denovo_analysis/result_files/tRNA_profile
trna_table=/home/pseema/denovo_analysis/result_files/isolate_list_tRNA_profile
# Save the line that comes right after each 'Predicting tRNAs and tmRNAs' line of the profile.
awk 'grab { print; grab = 0 } /Predicting tRNAs and tmRNAs/ { grab = 1 }' "$profile_report" > "$trna_lines"
# Put the third field of every saved line next to the matching line of the isolate list.
awk '{ print $3 }' < "$trna_lines" | paste "$isolate_roster" - > "$trna_table"
cat "$trna_table"
#################
# --------------------------
#!/usr/bin/env bash
# This (hypothetical_analysis_wrapper.sh) is the wrapper for all the hypothetical analysis-related scripts.
# It analyzes the hypothetical protein distribution among the 51 de novo + 2 reference strains.
# The details of the input files and paths are given in README.txt.
# Every stage script and every log file is addressed by a relative path, so the
# wrapper has to be started from the directory that holds the stage scripts.

#######################################
# Announce a stage script, run it with sh and, when a log file is named, copy
# the stage's stdout and stderr to both the terminal and that log file.
# Arguments: $1 - stage script to run
#            $2 - log file (optional; without it the output is not captured)
# Returns:   status of tee when logging, otherwise status of the stage script
#######################################
run_stage() {
  echo "$1 is running.........."
  if [ -n "${2:-}" ]; then
    # "2>&1 |" is the portable spelling of bash's "|&", so the wrapper also
    # works when it is started with plain sh like the stage scripts are.
    sh "$1" 2>&1 | tee "$2"
  else
    sh "$1"
  fi
}

######### hypothetical_profile.sh ############
# Extracts the critical information about each genome: number of hypothetical proteins, their size,
# their percentage of the total genome and other genome characteristics.
# The result is directed into the data file hypothetical_profile.
run_stage hypothetical_profile.sh hypothetical_profile
# Print the captured profile once more (tee has already shown it while the stage ran).
cat hypothetical_profile

######### hypothetical_profile_analysis.sh ############
# Its input file is hypothetical_profile. The script analyzes the hypothetical protein pattern across isolates.
run_stage hypothetical_profile_analysis.sh hypothetical_pattern

############ fasta_manipulation.sh ############
# Its input file is isolate.faa. The script analyzes smallest functional protein length and name, largest
# functional protein length and name, ESAT, ESX, transposase, efflux, FabG, antitoxin, ribonuclease, PE, PPE
# and universal stress protein pattern across isolates.
run_stage fasta_manipulation.sh fasta_manipulation_data_file

############ fasta_manipulation_analysis.sh ############
# Its input file is fasta_manipulation_data_file. The script creates a table of smallest functional protein
# length, largest functional protein length, country collected from, number of hypothetical proteins, ESAT,
# ESX, transposase, efflux, FabG, antitoxin, ribonuclease, PE, PPE and universal stress protein pattern in isolates.
# The output file is final_fasta_manipulation_analysis_file (generated in the hypothetical_analysis directory).
run_stage fasta_manipulation_analysis.sh

############ coiled.sh ############
# Its input files are isolate_hypothetical and isolate_list. A while loop is used to find the M. tuberculosis
# hypothetical gene coiled-coil motifs (24 hypothetical genes with motifs derived by SMART) in the de novo
# isolate hypothetical genes. Output files are isolate.coiled_motif_found_gene.
run_stage coiled.sh hypothetical_coiled_motif_found

############ coiled_analysis.sh ############
# Its input is the files generated by running coiled.sh, plus isolate_list. The script uses two while loops.
# Data is output to the monitor. It can also be executed with isolate_list changed to lineage_isolates.
run_stage coiled_analysis.sh hypothetical_coiled_motif_pattern_locations

############ membrane.sh ############
# Its input files are isolate_hypothetical and isolate_list. A while loop is used to find the M. tuberculosis
# hypothetical gene membrane motifs (68 hypothetical genes with motifs derived by SMART) in the de novo
# isolate hypothetical genes. Output files are isolate.helix_motifs_obtained_gene.
run_stage membrane.sh hypothetical_membrane_motif_found

############ membrane_analysis.sh ############
# Its input is the files generated by running membrane.sh, plus isolate_list. The script uses two while loops.
# Data is output to the monitor. It can also be executed with isolate_list changed to lineage_isolates.
run_stage membrane_analysis.sh hypothetical_membrane_motif_pattern_locations
# No comments:
# Post a Comment