# Script to analyse the protein domains of various viruses, allergens, toxins and other proteins.
# The folder viral_protein_fasta is in the pollen directory on the Desktop. It holds 15 protein FASTA files (randomly taken from UniProt).
#Protein domain analysis
#! usr/bin/bash
#Execute as: sh pollen_protein_domains.sh|& tee pollen_data
#
# For every name listed in isolate_list (one per line) this section writes
# three files into the results directory:
#   <name>.truncated.fasta  - input with columns 2 and 3 blanked
#   <name>.sorted           - unique lines of the truncated file, sorted
#   <name>.sorted_max_freq  - every distinct line with its occurrence count,
#                             most frequent first
pollen_dir=/home/pseema/Desktop/pollen
fasta_dir=$pollen_dir/pollen_protein_domains
results_dir=$pollen_dir/results

# Create the directory the outputs are actually written to (not a "results"
# directory relative to wherever the script is started); -p makes re-runs safe.
mkdir -p "$results_dir"

# "|| [ -n ... ]" keeps the last entry even if the list has no final newline.
while read -r isolate || [ -n "$isolate" ]
do
  # Skip blank lines in the list.
  [ -n "$isolate" ] || continue

  printf '%s\n' "###${isolate} running###.........."

  # Blank out columns 2 and 3. awk rebuilds the record with single-space
  # separators, so the two columns are emptied rather than physically dropped.
  awk '{ $2 = $3= ""; print }' "$fasta_dir/$isolate.fasta" \
    > "$results_dir/$isolate.truncated.fasta"

  # Sort alphabetically and drop duplicate lines.
  sort -u "$results_dir/$isolate.truncated.fasta" > "$results_dir/$isolate.sorted"

  # Frequency table, highest count first. This must read the truncated file:
  # the .sorted file is already de-duplicated, so counting it always gives 1.
  sort "$results_dir/$isolate.truncated.fasta" \
    | uniq -c \
    | sort -n -r > "$results_dir/$isolate.sorted_max_freq"
done < "$pollen_dir/isolate_list"
#Nested loop starts
#
# Compares the sorted domain list of every strain (strain_list) with that of
# every isolate (isolate_list) using comm, printing the shared and the
# one-sided entries together with their line counts. The header lines printed
# here are searched for later with grep, so their text must not change.
#
# NOTE: the *.matches_comm_* files are named after the isolate only, so each
# pass over a new strain overwrites the files written for the previous strain.
pollen_dir=/home/pseema/Desktop/pollen
results_dir=$pollen_dir/results

# "|| [ -n ... ]" keeps the last entry even if a list has no final newline.
while read -r strain || [ -n "$strain" ]
do
  # Skip blank lines in the strain list.
  [ -n "$strain" ] || continue
  strain_sorted=$results_dir/$strain.sorted

  while read -r isolate || [ -n "$isolate" ]
  do
    # Skip blank lines in the isolate list.
    [ -n "$isolate" ] || continue
    isolate_sorted=$results_dir/$isolate.sorted

    printf '%s\n' "#################Starting ${isolate}..####################"
    printf '%s\n' "Sorted list of domains for ${isolate}"
    cat "$isolate_sorted"
    printf '%s\n' "Number of domains for ${isolate}"
    wc -l < "$isolate_sorted"

    #Shows common proteins to file 1 and file2 (option -12 or -21 can be used to achieve it)
    printf '%s\n' "**Domains common to ${strain} and ${isolate}: **"
    comm -12 "$strain_sorted" "$isolate_sorted" > "$results_dir/$isolate.matches_comm_12"
    cat "$results_dir/$isolate.matches_comm_12"
    wc -l < "$results_dir/$isolate.matches_comm_12"

    #These proteins occur only in $strain (only column1)
    printf '%s\n' "**Proteins unique to strain ${strain} (not in ${isolate}): **"
    comm -23 "$strain_sorted" "$isolate_sorted" > "$results_dir/$isolate.matches_comm_23"
    cat "$results_dir/$isolate.matches_comm_23"
    wc -l < "$results_dir/$isolate.matches_comm_23"

    #These proteins occur only in $isolate (only column2)
    printf '%s\n' "**Proteins unique to isolate ${isolate} (not in ${strain}): **"
    comm -13 "$strain_sorted" "$isolate_sorted" > "$results_dir/$isolate.matches_comm_13"
    cat "$results_dir/$isolate.matches_comm_13"
    wc -l < "$results_dir/$isolate.matches_comm_13"
  done < "$pollen_dir/isolate_list"
done < "$pollen_dir/strain_list"

# Dump every input FASTA file to the output.
cat "$pollen_dir"/pollen_protein_domains/*.fasta
#Protein domain data manipulation
#! usr/bin/bash
#Execute as: sh pollen_data_manipulations.sh|& tee pollen_data_analysis
#
# For each report header, print every matching line of pollen_data together
# with the single line that follows it, then a separator row.
for report_header in \
  "Number of domains for" \
  "Domains common to" \
  "Proteins unique to strain" \
  "Proteins unique to isolate"
do
  grep -A1 "$report_header" /home/pseema/Desktop/pollen/pollen_data
  echo "##########################################"
done
#Consensus protein domain finding
#! usr/bin/bash
#Execute as: sh pollen_protein_common.sh|& tee pollen_domain_consensus
#
# Rebuilds the per-isolate .sorted files, then pools every *.sorted file in
# the results directory to produce overall domain counts.
pollen_dir=/home/pseema/Desktop/pollen
fasta_dir=$pollen_dir/pollen_protein_domains
results_dir=$pollen_dir/results

# "|| [ -n ... ]" keeps the last entry even if the list has no final newline.
while read -r isolate || [ -n "$isolate" ]
do
  # Skip blank lines in the list.
  [ -n "$isolate" ] || continue

  # Blank out columns 2 and 3 (awk rejoins the record with single spaces).
  awk '{ $2 = $3= ""; print }' "$fasta_dir/$isolate.fasta" \
    > "$results_dir/$isolate.truncated.fasta"

  # Sort alphabetically and drop duplicate lines.
  sort -u "$results_dir/$isolate.truncated.fasta" > "$results_dir/$isolate.sorted"
done < "$pollen_dir/isolate_list"

#Counts the total number of domains in different ways
# Pool all *.sorted files. The glob is spelled out instead of reusing the loop
# variable, whose value after the loop depends on how the list file ends.
cat "$results_dir"/*.sorted > "$results_dir/all_total_domains"

# First tab-separated column, sorted (duplicates kept).
awk -F '\t' '{print $1}' "$results_dir/all_total_domains" \
  | sort > "$results_dir/all_total_domains_sort"

# Occurrence count per distinct entry; reuses the sorted file from above.
uniq -c "$results_dir/all_total_domains_sort" \
  > "$results_dir/all_total_domains_sort_uniq"

# Same counts, ordered from most to least frequent.
sort -nr "$results_dir/all_total_domains_sort_uniq" \
  > "$results_dir/all_total_domains_sort_uniq_sortnr"

#Counts the unique number of domains
# Keep the first occurrence of every line across all files (empty lines are
# always kept), then report wc's line/word/byte counts for the unique set.
awk '!NF || !seen[$0]++' "$results_dir"/*.sorted > "$results_dir/all_isolate_random"
sort -u "$results_dir/all_isolate_random" | wc
#cat /home/pseema/Desktop/pollen/pollen_protein_domains/*.fasta
# Search every input FASTA file for each domain name in turn. The names are
# used as grep patterns exactly as written, including the leading/trailing
# spaces on a few of them.
for domain_pattern in \
  "YARHG" \
  "WH1" \
  "RICTOR_M" \
  "Pro-kuma_activ" \
  "MYSc" \
  "IENR1" \
  "HTH_ASNC" \
  "FABD" \
  "DDHD" \
  "DALR_2" \
  "WSN" \
  "VWC" \
  "Telomerase_RBD" \
  "RasGAP" \
  "PA2c" \
  "MIT" \
  "YqgFc" \
  "TLC" \
  "STI1" \
  "RUN" \
  "RL11" \
  "RAP" \
  "R3H" \
  "PI3Ka" \
  "PhBP" \
  "GMGS" \
  "Lipid_DES" \
  "LIM " \
  "LamG" \
  "HhH1" \
  "HALZ" \
  "Grip" \
  "Glyco_10" \
  "Elp3" \
  "DEP" \
  "Cyclin_C" \
  "Citrate_ly_lig" \
  "CAT" \
  "Brr6_like_C_C" \
  "B41" \
  "Y1_Tnp" \
  " LIGANc" \
  "IBN_N " \
  "HOX" \
  "HOLI" \
  "PLCYc" \
  "Hr1" \
  "H4" \
  "GGDEF" \
  "LPD_N" \
  "LON" \
  "Zalpha" \
  "VWC_out" \
  "ALBUMIN"
do
  grep "$domain_pattern" /home/pseema/Desktop/pollen/pollen_protein_domains/*.fasta
done
#Wrapper
#! usr/bin/bash
#sh pollen_wrapper.sh
#
# Runs the three analysis scripts in order, showing their output on the
# terminal and saving a copy (stdout and stderr) to a log file in the current
# directory. "2>&1 |" is used instead of "|&" because "|&" is a bash
# extension and this wrapper is started with sh.
# NOTE: the second script reads /home/pseema/Desktop/pollen/pollen_data, so
# run the wrapper from /home/pseema/Desktop/pollen for the log written by the
# first step to be the file the second step reads.
sh pollen_protein_domains.sh 2>&1 | tee pollen_data
sh pollen_data_manipulations.sh 2>&1 | tee pollen_data_analysis
sh pollen_protein_common.sh 2>&1 | tee pollen_domain_consensus
# No comments:
# Post a Comment