Thursday, July 20, 2017

MY SCRIPT (2): Virus, allergen, toxin domain comparison...........


#!/usr/bin/env bash
# pollen_protein_domains.sh (part 1): build the per-isolate domain tables.
#
# Script to analyse the protein domains of various viruses, allergens, toxins
# and other proteins. The original notes describe the input folder as holding
# 15 protein files (randomly taken from UniProt) and call it viral_protein_fasta;
# the paths actually read below are under pollen_protein_domains.
#
# Execute as: sh pollen_protein_domains.sh 2>&1 | tee pollen_data
#
# Base directory: $POLLEN_DIR, defaulting to /home/pseema/Desktop/pollen.
# Inputs:
#   isolate_list                          one isolate name per line
#   pollen_protein_domains/<name>.fasta   table read with awk's default
#                                         whitespace field splitting
# Outputs (all under results/):
#   <name>.truncated.fasta   every input line with fields 2 and 3 blanked
#   <name>.sorted            the truncated lines, sorted and de-duplicated
#   <name>.sorted_max_freq   "count line" rows, most frequent line first

# build_domain_tables BASE_DIR
# Writes the three result files listed above for every isolate named in
# BASE_DIR/isolate_list and prints one progress line per isolate.
# The body is a subshell, so its variables do not leak into the caller.
build_domain_tables() (
  base=$1
  in_dir=$base/pollen_protein_domains
  out_dir=$base/results

  # Create the directory the redirections below write into. -p keeps reruns
  # quiet and no longer depends on the directory the script is started from.
  mkdir -p "$out_dir" || exit 1

  # "|| [ -n ... ]" still processes a last line that lacks a trailing newline.
  while read -r isolate || [ -n "$isolate" ]; do
    # A blank line would otherwise be treated as an isolate called "".
    [ -n "$isolate" ] || continue

    echo "###$isolate running###.........."

    # Blank out fields 2 and 3. awk rebuilds the line with single spaces, so
    # field 1 is followed by the two emptied fields; fields after the third
    # (if any) are kept.
    awk '{ $2 = $3= ""; print }' "$in_dir/$isolate.fasta" \
      > "$out_dir/$isolate.truncated.fasta"

    # Sorted, de-duplicated list (this is what comm consumes later).
    sort -u "$out_dir/$isolate.truncated.fasta" > "$out_dir/$isolate.sorted"

    # Frequency table. It must be counted on the truncated file: counting the
    # de-duplicated list would report 1 for every line.
    sort "$out_dir/$isolate.truncated.fasta" \
      | uniq -c \
      | sort -n -r > "$out_dir/$isolate.sorted_max_freq"
  done < "$base/isolate_list"
)

build_domain_tables "${POLLEN_DIR:-/home/pseema/Desktop/pollen}"

# pollen_protein_domains.sh (part 2): pairwise strain/isolate comparison.
#
# For every (strain, isolate) pair this prints the isolate's sorted domain
# list and its size, then the lines common to both tables, the lines found
# only in the strain table and the lines found only in the isolate table,
# each list followed by its line count.
#
# Base directory: $POLLEN_DIR, defaulting to /home/pseema/Desktop/pollen.
# Inputs:  strain_list, isolate_list, results/<name>.sorted
# Outputs: results/<isolate>.matches_comm_12 / _23 / _13, plus the report
#          on stdout.

# compare_domain_sets BASE_DIR
# Runs the comparison for all pairs from BASE_DIR/strain_list x
# BASE_DIR/isolate_list. The body is a subshell, so nothing leaks to the caller.
compare_domain_sets() (
  base=$1
  results=$base/results

  # "|| [ -n ... ]" keeps a final list line that has no trailing newline;
  # blank lines are skipped instead of being looked up as ".sorted".
  while read -r strain || [ -n "$strain" ]; do
    [ -n "$strain" ] || continue

    while read -r isolate || [ -n "$isolate" ]; do
      [ -n "$isolate" ] || continue

      strain_set=$results/$strain.sorted
      isolate_set=$results/$isolate.sorted

      echo "#################Starting $isolate..####################"
      echo "Sorted list of domains for $isolate"
      cat "$isolate_set"
      echo "Number of domains for $isolate"
      wc -l < "$isolate_set"

      # NOTE(review): the three files below are named after the isolate only,
      # so each strain pass overwrites the files of the previous strain; the
      # complete per-pair record is the report printed on stdout.

      # comm -12 suppresses columns 1 and 2: lines present in both tables.
      echo "**Domains common to $strain and $isolate: **"
      comm -12 "$strain_set" "$isolate_set" > "$results/$isolate.matches_comm_12"
      cat "$results/$isolate.matches_comm_12"
      wc -l < "$results/$isolate.matches_comm_12"

      # comm -23 keeps column 1: lines that occur only in the strain table.
      echo "**Proteins unique to strain $strain (not in $isolate): **"
      comm -23 "$strain_set" "$isolate_set" > "$results/$isolate.matches_comm_23"
      cat "$results/$isolate.matches_comm_23"
      wc -l < "$results/$isolate.matches_comm_23"

      # comm -13 keeps column 2: lines that occur only in the isolate table.
      echo "**Proteins unique to isolate $isolate (not in $strain): **"
      comm -13 "$strain_set" "$isolate_set" > "$results/$isolate.matches_comm_13"
      cat "$results/$isolate.matches_comm_13"
      wc -l < "$results/$isolate.matches_comm_13"
    done < "$base/isolate_list"
  done < "$base/strain_list"
)

pollen_dir=${POLLEN_DIR:-/home/pseema/Desktop/pollen}
compare_domain_sets "$pollen_dir"

# Finally dump every input file to stdout.
cat "$pollen_dir"/pollen_protein_domains/*.fasta

#!/usr/bin/env bash
# pollen_data_manipulations.sh: protein domain data manipulation.
# Execute as: sh pollen_data_manipulations.sh|& tee pollen_data_analysis
#
# For each report header listed below, print every line of pollen_data that
# contains it together with the single line that follows (grep -A1), and
# close each section with a separator rule.
for header in \
  "Number of domains for" \
  "Domains common to" \
  "Proteins unique to strain" \
  "Proteins unique to isolate"
do
  grep -A1 "$header" /home/pseema/Desktop/pollen/pollen_data
  echo "##########################################"
done

#!/usr/bin/env bash
# pollen_protein_common.sh (part 1): consensus protein domain finding.
# Execute as: sh pollen_protein_common.sh 2>&1 | tee pollen_domain_consensus
#
# Rebuilds, for every isolate named in isolate_list, the truncated table and
# the sorted, de-duplicated table under results/.
# Base directory: $POLLEN_DIR, defaulting to /home/pseema/Desktop/pollen.

# truncate_and_sort_isolates BASE_DIR
# Reads BASE_DIR/pollen_protein_domains/<name>.fasta and writes
# BASE_DIR/results/<name>.truncated.fasta and BASE_DIR/results/<name>.sorted.
# The body is a subshell, so the loop variable never leaks into the caller.
truncate_and_sort_isolates() (
  base=$1
  in_dir=$base/pollen_protein_domains
  out_dir=$base/results

  # Make sure the redirection targets below have a directory to land in.
  mkdir -p "$out_dir" || exit 1

  # "|| [ -n ... ]" keeps a final line that has no trailing newline.
  while read -r isolate || [ -n "$isolate" ]; do
    # Skip blank lines rather than looking for a file named ".fasta".
    [ -n "$isolate" ] || continue

    # Blank out fields 2 and 3; field 1 and any fields after the third stay.
    awk '{ $2 = $3= ""; print }' "$in_dir/$isolate.fasta" \
      > "$out_dir/$isolate.truncated.fasta"

    # Sort alphabetically and drop duplicate lines.
    sort -u "$out_dir/$isolate.truncated.fasta" > "$out_dir/$isolate.sorted"
  done < "$base/isolate_list"
)

truncate_and_sort_isolates "${POLLEN_DIR:-/home/pseema/Desktop/pollen}"

# pollen_protein_common.sh (part 2): pooled domain counts.
# Base directory: $POLLEN_DIR, defaulting to /home/pseema/Desktop/pollen.

# summarise_domain_totals BASE_DIR
# Pools every BASE_DIR/results/*.sorted table and writes, under
# BASE_DIR/results/:
#   all_total_domains                   all tables concatenated
#   all_total_domains_sort              first tab-separated field, sorted
#   all_total_domains_sort_uniq         the same with a count per distinct line
#   all_total_domains_sort_uniq_sortnr  the counts, highest first
#   all_isolate_random                  pooled lines, first occurrence only
# and prints the wc line/word/byte counts of the unique pooled lines.
summarise_domain_totals() (
  results=$1/results

  # Counts the total number of domains in different ways.
  # The glob is spelled "*.sorted" on purpose: it must not depend on whatever
  # value a previous read loop happened to leave in $isolate.
  cat "$results"/*.sorted > "$results/all_total_domains"

  # Each stage reuses the previous stage's file instead of re-running the
  # whole pipeline; the resulting files are the same.
  awk -F '\t' '{print $1}' "$results/all_total_domains" \
    | sort > "$results/all_total_domains_sort"
  uniq -c "$results/all_total_domains_sort" \
    > "$results/all_total_domains_sort_uniq"
  sort -nr "$results/all_total_domains_sort_uniq" \
    > "$results/all_total_domains_sort_uniq_sortnr"

  # Counts the unique number of domains: keep the first occurrence of each
  # line in input order (lines with no fields are always kept), then count.
  awk '!NF || !seen[$0]++' "$results"/*.sorted > "$results/all_isolate_random"
  sort -u "$results/all_isolate_random" | wc
)

summarise_domain_totals "${POLLEN_DIR:-/home/pseema/Desktop/pollen}"

#cat /home/pseema/Desktop/pollen/pollen_protein_domains/*.fasta
# Search every input file for each pattern in turn, in the order listed.
# Patterns are quoted individually because some of them carry a deliberate
# leading or trailing space.
for domain in \
  "YARHG" \
  "WH1" \
  "RICTOR_M" \
  "Pro-kuma_activ" \
  "MYSc" \
  "IENR1" \
  "HTH_ASNC" \
  "FABD" \
  "DDHD" \
  "DALR_2" \
  "WSN" \
  "VWC" \
  "Telomerase_RBD" \
  "RasGAP" \
  "PA2c" \
  "MIT" \
  "YqgFc" \
  "TLC" \
  "STI1" \
  "RUN" \
  "RL11" \
  "RAP" \
  "R3H" \
  "PI3Ka" \
  "PhBP" \
  "GMGS" \
  "Lipid_DES" \
  "LIM " \
  "LamG" \
  "HhH1" \
  "HALZ" \
  "Grip" \
  "Glyco_10" \
  "Elp3" \
  "DEP" \
  "Cyclin_C" \
  "Citrate_ly_lig" \
  "CAT" \
  "Brr6_like_C_C" \
  "B41" \
  "Y1_Tnp" \
  " LIGANc" \
  "IBN_N " \
  "HOX" \
  "HOLI" \
  "PLCYc" \
  "Hr1" \
  "H4" \
  "GGDEF" \
  "LPD_N" \
  "LON" \
  "Zalpha" \
  "VWC_out" \
  "ALBUMIN"
do
  grep "$domain" /home/pseema/Desktop/pollen/pollen_protein_domains/*.fasta
done

#!/usr/bin/env bash
# pollen_wrapper.sh: run the three analysis stages in order, saving each
# stage's combined stdout and stderr to a log while still showing it.
# Execute as: sh pollen_wrapper.sh
#
# "2>&1 |" is used instead of "|&": the latter is a bash shorthand that a
# plain POSIX sh (for example dash) rejects as a syntax error, and this
# wrapper is meant to be started with sh.
#
# The stage scripts and the logs are resolved relative to the current
# directory. pollen_data_manipulations.sh reads
# /home/pseema/Desktop/pollen/pollen_data, so run the wrapper from
# /home/pseema/Desktop/pollen for the first log to land where it is read.
sh pollen_protein_domains.sh 2>&1 | tee pollen_data
sh pollen_data_manipulations.sh 2>&1 | tee pollen_data_analysis
sh pollen_protein_common.sh 2>&1 | tee pollen_domain_consensus

No comments:

Post a Comment

Laboratory tools and reagents (Micro-pipettes)...

Micro-pipettes are essential tools of R & D labs, and an integral part of Good Laboratory Practices (GLPs). Micro-pipetting methods include ...