#File arrangement in the tool
#setup.cfg
[bdist_wheel]
universal=1
#setup.py
from setuptools import setup

# Package metadata is read from the package itself so it is defined in one place.
from hypothetical import __version__, __author__, __title__, __license__


def readme():
    """Return the full text of README.rst, used as the long description."""
    with open('README.rst') as fin:
        return fin.read()


setup(
    name=__title__,
    version=__version__,
    description='A bioinformatic annotation',
    long_description=readme(),
    #url='https://github.com/.....',
    author=__author__,
    #author_email='gmail.com',
    # NOTE(review): the package imported above is `hypothetical`, but the
    # package listed here is `rabifier` -- confirm which name is intended.
    packages=['rabifier'],
    license=__license__,
    zip_safe=False,
    # Fixed: the first keyword was missing its closing quote (syntax error).
    keywords=['hypothetical', 'annotation'],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        'Natural Language :: English',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.4',
    ],
    install_requires=[
        'biopython',
        'numpy',
        'scipy'
    ],
    setup_requires=[
        'numpy'
    ],
    # matplotlib is optional; it is only needed for plotting.
    extras_require={
        'plotting': ['matplotlib']
    },
    include_package_data=True,
    # NOTE(review): script paths still use the `rabifier` name -- confirm
    # they match the files actually shipped in bin/.
    scripts=[
        'bin/rabifier',
        'bin/rabifier-mkdb'
    ]
)
#bin (software, software-mkdb)
#!/usr/bin/env python
"""Command-line entry point: runs ``hypothetical.annotation.main()``."""
if __name__ == '__main__':
    # Imported inside the guard so the package is loaded only when this file
    # is executed as a script.
    import hypothetical.annotation
    hypothetical.annotation.main()
------------------------------------------------
#!/usr/bin/env python
"""Command-line entry point: runs ``hypothetical.core.main()`` with INFO logging."""
if __name__ == '__main__':
    # Imported inside the guard so the package is loaded only when this file
    # is executed as a script.
    import logging
    import hypothetical.core
    # Configure the root logger before main() starts emitting records.
    logging.basicConfig(level=logging.INFO)
    hypothetical.core.main()
------------------------------------------------
#software (data folder, scripts)
software/data
data folder has .fasta, .meme, .hmm, .json
#MEME is a motif file format; JSON is a text format for transmitting data objects
#.fasta (use as many FASTA sequences as needed)
#.meme (use as many peptide motifs as needed)
MEME version 4
ALPHABET= ACDEFGHIKLMNPQRSTVWY
Background letter frequencies
A 0.077 C 0.019 D 0.063 E 0.064 F 0.044 G 0.070 H 0.016 I 0.056 K 0.067
L 0.080 M 0.020 N 0.044 P 0.028 Q 0.042 R 0.056 S 0.078 T 0.063 V 0.071
W 0.012 Y 0.030
MOTIF KLGCAY
letter-probability matrix: alength= 20 w= 5 nsites= 1152 E= 6.9e-2134
0.000868 0.000000 0.000000 0.000868
0.002604 0.001736 0.001736 0.000868
MOTIF GERSWY
letter-probability matrix: alength= 20 w= 5 nsites= 1242 E= 7.3e-2599
0.025827 0.000821 0.002466 0.001662 0.000036
0.086214 0.001626 0.000050 0.000052 0.09182
# .json (provide the location info) #an example from other source
e.g.
{
"rab37": {
"hs_location": 110.50251713293918,
"ph_location": 112.86100043738627,
"ph_scale": 11.00197156371245,
"hs_scale": 8.85713380184737
},
"rab36": {
"hs_location": 154.27225307156183,
"ph_location": 153.74148083154392,
"ph_scale": 13.191095803448913,
"hs_scale": 10.547615274982139
},
"DmRabX5": {
"hs_location": 140.0,
"ph_location": 140.02393936245005,
"ph_scale": 9.3399952365444658,
"hs_scale": 6.0
},
"DmRabX4": {
"hs_location": 104.09023474783298,
"ph_location": 92.151361443303443,
"ph_scale": 13.713705239848309,
"hs_scale": 6.8325548548732469
}
}
---------------------------------------------------------
#Python Scripts
__init__.py
config.json
core.py
hypothetical.py
utils.py
#__init__.py
#!/usr/bin/env python
import os
import json
import logging
import tempfile
# Package metadata (also imported by setup.py).
__title__ = 'hypothetical'
__version__ = '1.0.0'
__author__ = 'Seema Patel'
__license__ = 'xxxx'
__copyright__ = 'Copyright 2016 Seema Patel'

# load configuration
# config.json is read from the directory that contains this module.
with open(os.path.join(os.path.dirname(__file__), 'config.json')) as fin:
    config = json.load(fin)
# The temporary directory is resolved at import time and stored alongside the
# values loaded from config.json.
config['tmp'] = tempfile.gettempdir()

# Configure logging
#logging.basicConfig(level=logging.DEBUG)
# The bare getLogger() call discarded its result and had no effect. Attach a
# NullHandler so that importing the package never triggers "no handlers"
# warnings; applications remain free to configure logging themselves.
logging.getLogger(__name__).addHandler(logging.NullHandler())
#core.py
#!/usr/bin/env python
from __future__ import print_function
import os
import json
from collections import defaultdict
import math
import operator
import logging
import argparse
from Bio import SeqIO, AlignIO
import numpy as np
import scipy.stats
try:
import matplotlib.pyplot as plt
except ImportError:
plt = None
from . import __version__
from . import config
from .utils import Pathfinder, run_cmd, run_cmd_if_file_missing, merge_files
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Force this logger to DEBUG regardless of how the application configures logging.
# NOTE(review): ancestor logger levels are not consulted when records
# propagate, so DEBUG records from this module may still reach the root
# handlers even though the entry script calls
# logging.basicConfig(level=logging.INFO) -- confirm this is intended.
logger.setLevel(logging.DEBUG)
class Seed(object):
    """Driver that builds the seed database files from a source directory.

    Instances are callable: calling one runs the build steps in order
    (see ``__call__``). Input locations are kept in ``self.path`` and output
    locations in ``self.output``; all outputs live under ``self.path_tmp``.
    """

    def __init__(self, source, **kwargs):
        """Prepare working directories and paths, then verify external tools.

        :param source: directory holding the seed source data; stored as an
            absolute path.
        :param kwargs: optional settings. ``tmp`` overrides the working
            directory (default: ``annotation_tmp`` inside ``config['tmp']``);
            ``cpu`` overrides ``config['param']['cpu']``.
        :raises RuntimeError: if a required external tool is missing
            (raised by ``check``).
        """
        self.source = os.path.abspath(source)
        # NOTE(review): the meaning of the ``True`` argument is defined in
        # utils.Pathfinder, which is not visible here.
        self.pathfinder = Pathfinder(True)

        # Temporary folders,
        self.path_tmp = kwargs.get('tmp', os.path.join(config['tmp'], 'annotation_tmp'))
        for name in ('build', 'test', 'seed'):
            path = os.path.join(self.path_tmp, name)
            if not os.path.exists(path):
                os.makedirs(path)

        # Input files. The 'domain' entry keeps a '{}' placeholder that is
        # left unformatted here.
        self.path = {
            'non_domain': os.path.join(self.source, 'other', '{}.full'.format(config['seed']['non_domain'])),
            'domain_model_manual_override':
                os.path.join(self.source, 'other', 'domain_subfamilies_logreg_params.dat'),
            'domain': os.path.join(self.source, 'domain', '{}.fasta.full')
        }

        # Output files: final products go to 'seed', intermediates to 'build'.
        # 'domain_f_single_motif' keeps a '{}' placeholder, left unformatted here.
        self.output = {
            'domain_db': os.path.join(self.path_tmp, 'seed', config['seed']['domain_db']),
            'domain_hmm': os.path.join(self.path_tmp, 'seed', config['seed']['domain_hmm']),
            'domain_model': os.path.join(self.path_tmp, 'seed', config['seed']['domain_model']),
            # NOTE(review): key 'rdomain_f' does not follow the naming of the
            # other config keys -- confirm it matches config.json.
            'domain_f': os.path.join(self.path_tmp, 'seed', config['seed']['rdomain_f']),
            'non_domain_db': os.path.join(self.path_tmp, 'seed', config['seed']['non_domain_db']),
            'domain_db_reduced': os.path.join(self.path_tmp, 'build', 'domain_db_reduced.fasta'),
            'domain_db_reduced_msa': os.path.join(self.path_tmp, 'build', 'domain_db_reduced.mafft'),
            'domain_f_single_motif': os.path.join(self.path_tmp, 'build', 'motif_{}.meme')
        }

        # Stored as a string (presumably for use in command lines -- the
        # consumers are not visible here).
        self.cpu = str(kwargs.get('cpu', config['param']['cpu']))

        self.check()

        # Load subfamily names that will be considered for the seed database construction
        self.domain_subfamilies = config['domain_subfamilies']

    def get_subfamily_path(self, subfamily, extension=None):
        """Return the build-directory path for *subfamily*.

        :param subfamily: subfamily name, used as the file base name.
        :param extension: optional extension (without the leading dot) to
            append; when ``None`` the bare path is returned.
        :return: ``<path_tmp>/build/<subfamily>`` or
            ``<path_tmp>/build/<subfamily>.<extension>``.
        """
        path = os.path.join(self.path_tmp, 'build', subfamily)
        if extension is None:
            return path
        return '.'.join([path, extension])

    def check(self):
        """Verify that every required external tool is available.

        :raises RuntimeError: for the first tool that
            ``self.pathfinder.exists`` reports as missing.
        """
        for tool in ('cd-hit', 'prank', 'hmmbuild', 'hmmpress', 'hmmscan', 'phmmer', 'mafft', 'meme'):
            if not self.pathfinder.exists(tool):
                raise RuntimeError("Dependency {} is missing".format(tool))

    def __call__(self):
        """Run the build steps in order."""
        self.build_subfamilies()
        self.build_subfamily_models()
        self.build_domain_f_models()
        self.generate_non_domain()

    def dump_db_files(self, destination):
        """Placeholder; currently does nothing with *destination*."""
        pass
def build_subfamilies(self):
for subfamily in self.domain_subfamilies:
.......................
#find longest ORF in DNA sequence
#(when ATG is the start codon; TAG, TGA, and TAA are stop codons)
import re


def longest_orf(s):
    """Return the longest open reading frame found in DNA string *s*.

    An ORF starts at ``ATG`` and extends in steps of three bases up to and
    including the first in-frame stop codon (``TAA``, ``TAG`` or ``TGA``).
    Matching is case-sensitive and only the given strand is scanned.

    :param s: DNA sequence.
    :return: the longest ORF (the earliest one on ties), or ``''`` when the
        sequence contains no complete ORF.
    """
    # The pattern is wrapped in a lookahead so that a candidate is collected
    # at every start position. A plain findall() is non-overlapping and would
    # skip a longer ORF that begins inside an earlier match.
    orfs = re.findall(r'(?=(ATG(?:(?!TAA|TAG|TGA)...)*(?:TAA|TAG|TGA)))', s)
    # max() raises ValueError on an empty list, so handle "no ORF" explicitly.
    return max(orfs, key=len) if orfs else ''
#Counting DNA motif occurrences
def count_motif_occurrences(sequence, motif, half_motif):
    """Count occurrences of *motif* in *sequence*, plus half credits.

    The sequence is split on *motif*; each split point counts as one
    occurrence. Every resulting fragment that begins with *half_motif* adds
    0.5 (presumably to credit a truncated copy of the motif). Matching is
    case-sensitive.

    :param sequence: sequence to scan.
    :param motif: full motif; must be non-empty (``str.split`` raises
        ``ValueError`` on an empty separator).
    :param half_motif: prefix that earns half credit.
    :return: the count, in steps of 0.5.
    """
    fragments = sequence.split(motif)
    full_hits = len(fragments) - 1
    half_hits = sum(0.5 for fragment in fragments if fragment.startswith(half_motif))
    return full_hits + half_hits


contig = "CCCCAAAACCCCAAAACCCCAAAACCCCTAcGAaTCCCcTCATAATTGAAAGACTTAAACTTTAAAACCCTAGAAT"
splitbase = "CCCCAAAA"
halfBase = "CCCC"
splittedContig = contig.split(splitbase)
cnt = len(splittedContig) - 1
# Call form of print works on both Python 2 and Python 3 (the original print
# statement is a syntax error on Python 3).
print(count_motif_occurrences(contig, splitbase, halfBase))
#setup.cfg
[bdist_wheel]
universal=1
#setup.py
from setuptools import setup

# Package metadata is read from the package itself so it is defined in one place.
from hypothetical import __version__, __author__, __title__, __license__


def readme():
    """Return the full text of README.rst, used as the long description."""
    with open('README.rst') as fin:
        return fin.read()


setup(
    name=__title__,
    version=__version__,
    description='A bioinformatic annotation',
    long_description=readme(),
    #url='https://github.com/.....',
    author=__author__,
    #author_email='gmail.com',
    # NOTE(review): the package imported above is `hypothetical`, but the
    # package listed here is `rabifier` -- confirm which name is intended.
    packages=['rabifier'],
    license=__license__,
    zip_safe=False,
    # Fixed: the first keyword was missing its closing quote (syntax error).
    keywords=['hypothetical', 'annotation'],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        'Natural Language :: English',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.4',
    ],
    install_requires=[
        'biopython',
        'numpy',
        'scipy'
    ],
    setup_requires=[
        'numpy'
    ],
    # matplotlib is optional; it is only needed for plotting.
    extras_require={
        'plotting': ['matplotlib']
    },
    include_package_data=True,
    # NOTE(review): script paths still use the `rabifier` name -- confirm
    # they match the files actually shipped in bin/.
    scripts=[
        'bin/rabifier',
        'bin/rabifier-mkdb'
    ]
)
#bin (software, software-mkdb)
#!/usr/bin/env python
"""Command-line entry point: runs ``hypothetical.annotation.main()``."""
if __name__ == '__main__':
    # Imported inside the guard so the package is loaded only when this file
    # is executed as a script.
    import hypothetical.annotation
    hypothetical.annotation.main()
------------------------------------------------
#!/usr/bin/env python
"""Command-line entry point: runs ``hypothetical.core.main()`` with INFO logging."""
if __name__ == '__main__':
    # Imported inside the guard so the package is loaded only when this file
    # is executed as a script.
    import logging
    import hypothetical.core
    # Configure the root logger before main() starts emitting records.
    logging.basicConfig(level=logging.INFO)
    hypothetical.core.main()
------------------------------------------------
#software (data folder, scripts)
software/data
data folder has .fasta, .meme, .hmm, .json
#MEME is a motif file format; JSON is a text format for transmitting data objects
#.fasta (use as many fasta seq as needed)
>a3___16870
SLLIFLENEVRERLGLDSIKTHKWTILPCLEWVVQDAKD
>b6___145304
PTYTMADQTPDSWEDELSRQTFTPGAASFVPGQA
#.hmm
#.meme (use as many peptide motifs as needed)
MEME version 4
ALPHABET= ACDEFGHIKLMNPQRSTVWY
Background letter frequencies
A 0.077 C 0.019 D 0.063 E 0.064 F 0.044 G 0.070 H 0.016 I 0.056 K 0.067
L 0.080 M 0.020 N 0.044 P 0.028 Q 0.042 R 0.056 S 0.078 T 0.063 V 0.071
W 0.012 Y 0.030
MOTIF KLGCAY
letter-probability matrix: alength= 20 w= 5 nsites= 1152 E= 6.9e-2134
0.000868 0.000000 0.000000 0.000868
0.002604 0.001736 0.001736 0.000868
MOTIF GERSWY
letter-probability matrix: alength= 20 w= 5 nsites= 1242 E= 7.3e-2599
0.025827 0.000821 0.002466 0.001662 0.000036
0.086214 0.001626 0.000050 0.000052 0.09182
# .json (provide the location info) #an example from other source
e.g.
{
"rab37": {
"hs_location": 110.50251713293918,
"ph_location": 112.86100043738627,
"ph_scale": 11.00197156371245,
"hs_scale": 8.85713380184737
},
"rab36": {
"hs_location": 154.27225307156183,
"ph_location": 153.74148083154392,
"ph_scale": 13.191095803448913,
"hs_scale": 10.547615274982139
},
"DmRabX5": {
"hs_location": 140.0,
"ph_location": 140.02393936245005,
"ph_scale": 9.3399952365444658,
"hs_scale": 6.0
},
"DmRabX4": {
"hs_location": 104.09023474783298,
"ph_location": 92.151361443303443,
"ph_scale": 13.713705239848309,
"hs_scale": 6.8325548548732469
}
}
---------------------------------------------------------
#Python Scripts
__init__.py
config.json
core.py
hypothetical.py
utils.py
#__init__.py
#!/usr/bin/env python
import os
import json
import logging
import tempfile
# Package metadata (also imported by setup.py).
__title__ = 'hypothetical'
__version__ = '1.0.0'
__author__ = 'Seema Patel'
__license__ = 'xxxx'
__copyright__ = 'Copyright 2016 Seema Patel'

# load configuration
# config.json is read from the directory that contains this module.
with open(os.path.join(os.path.dirname(__file__), 'config.json')) as fin:
    config = json.load(fin)
# The temporary directory is resolved at import time and stored alongside the
# values loaded from config.json.
config['tmp'] = tempfile.gettempdir()

# Configure logging
#logging.basicConfig(level=logging.DEBUG)
# The bare getLogger() call discarded its result and had no effect. Attach a
# NullHandler so that importing the package never triggers "no handlers"
# warnings; applications remain free to configure logging themselves.
logging.getLogger(__name__).addHandler(logging.NullHandler())
#core.py
#!/usr/bin/env python
from __future__ import print_function
import os
import json
from collections import defaultdict
import math
import operator
import logging
import argparse
from Bio import SeqIO, AlignIO
import numpy as np
import scipy.stats
try:
import matplotlib.pyplot as plt
except ImportError:
plt = None
from . import __version__
from . import config
from .utils import Pathfinder, run_cmd, run_cmd_if_file_missing, merge_files
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Force this logger to DEBUG regardless of how the application configures logging.
# NOTE(review): ancestor logger levels are not consulted when records
# propagate, so DEBUG records from this module may still reach the root
# handlers even though the entry script calls
# logging.basicConfig(level=logging.INFO) -- confirm this is intended.
logger.setLevel(logging.DEBUG)
class Seed(object):
    """Driver that builds the seed database files from a source directory.

    Instances are callable: calling one runs the build steps in order
    (see ``__call__``). Input locations are kept in ``self.path`` and output
    locations in ``self.output``; all outputs live under ``self.path_tmp``.
    """

    def __init__(self, source, **kwargs):
        """Prepare working directories and paths, then verify external tools.

        :param source: directory holding the seed source data; stored as an
            absolute path.
        :param kwargs: optional settings. ``tmp`` overrides the working
            directory (default: ``annotation_tmp`` inside ``config['tmp']``);
            ``cpu`` overrides ``config['param']['cpu']``.
        :raises RuntimeError: if a required external tool is missing
            (raised by ``check``).
        """
        self.source = os.path.abspath(source)
        # NOTE(review): the meaning of the ``True`` argument is defined in
        # utils.Pathfinder, which is not visible here.
        self.pathfinder = Pathfinder(True)

        # Temporary folders,
        self.path_tmp = kwargs.get('tmp', os.path.join(config['tmp'], 'annotation_tmp'))
        for name in ('build', 'test', 'seed'):
            path = os.path.join(self.path_tmp, name)
            if not os.path.exists(path):
                os.makedirs(path)

        # Input files. The 'domain' entry keeps a '{}' placeholder that is
        # left unformatted here.
        self.path = {
            'non_domain': os.path.join(self.source, 'other', '{}.full'.format(config['seed']['non_domain'])),
            'domain_model_manual_override':
                os.path.join(self.source, 'other', 'domain_subfamilies_logreg_params.dat'),
            'domain': os.path.join(self.source, 'domain', '{}.fasta.full')
        }

        # Output files: final products go to 'seed', intermediates to 'build'.
        # 'domain_f_single_motif' keeps a '{}' placeholder, left unformatted here.
        self.output = {
            'domain_db': os.path.join(self.path_tmp, 'seed', config['seed']['domain_db']),
            'domain_hmm': os.path.join(self.path_tmp, 'seed', config['seed']['domain_hmm']),
            'domain_model': os.path.join(self.path_tmp, 'seed', config['seed']['domain_model']),
            # NOTE(review): key 'rdomain_f' does not follow the naming of the
            # other config keys -- confirm it matches config.json.
            'domain_f': os.path.join(self.path_tmp, 'seed', config['seed']['rdomain_f']),
            'non_domain_db': os.path.join(self.path_tmp, 'seed', config['seed']['non_domain_db']),
            'domain_db_reduced': os.path.join(self.path_tmp, 'build', 'domain_db_reduced.fasta'),
            'domain_db_reduced_msa': os.path.join(self.path_tmp, 'build', 'domain_db_reduced.mafft'),
            'domain_f_single_motif': os.path.join(self.path_tmp, 'build', 'motif_{}.meme')
        }

        # Stored as a string (presumably for use in command lines -- the
        # consumers are not visible here).
        self.cpu = str(kwargs.get('cpu', config['param']['cpu']))

        self.check()

        # Load subfamily names that will be considered for the seed database construction
        self.domain_subfamilies = config['domain_subfamilies']

    def get_subfamily_path(self, subfamily, extension=None):
        """Return the build-directory path for *subfamily*.

        :param subfamily: subfamily name, used as the file base name.
        :param extension: optional extension (without the leading dot) to
            append; when ``None`` the bare path is returned.
        :return: ``<path_tmp>/build/<subfamily>`` or
            ``<path_tmp>/build/<subfamily>.<extension>``.
        """
        path = os.path.join(self.path_tmp, 'build', subfamily)
        if extension is None:
            return path
        return '.'.join([path, extension])

    def check(self):
        """Verify that every required external tool is available.

        :raises RuntimeError: for the first tool that
            ``self.pathfinder.exists`` reports as missing.
        """
        for tool in ('cd-hit', 'prank', 'hmmbuild', 'hmmpress', 'hmmscan', 'phmmer', 'mafft', 'meme'):
            if not self.pathfinder.exists(tool):
                raise RuntimeError("Dependency {} is missing".format(tool))

    def __call__(self):
        """Run the build steps in order."""
        self.build_subfamilies()
        self.build_subfamily_models()
        self.build_domain_f_models()
        self.generate_non_domain()

    def dump_db_files(self, destination):
        """Placeholder; currently does nothing with *destination*."""
        pass
def build_subfamilies(self):
for subfamily in self.domain_subfamilies:
.......................
#find longest ORF in DNA sequence
#(when ATG is the start codon; TAG, TGA, and TAA are stop codons)
import re


def longest_orf(s):
    """Return the longest open reading frame found in DNA string *s*.

    An ORF starts at ``ATG`` and extends in steps of three bases up to and
    including the first in-frame stop codon (``TAA``, ``TAG`` or ``TGA``).
    Matching is case-sensitive and only the given strand is scanned.

    :param s: DNA sequence.
    :return: the longest ORF (the earliest one on ties), or ``''`` when the
        sequence contains no complete ORF.
    """
    # The pattern is wrapped in a lookahead so that a candidate is collected
    # at every start position. A plain findall() is non-overlapping and would
    # skip a longer ORF that begins inside an earlier match.
    orfs = re.findall(r'(?=(ATG(?:(?!TAA|TAG|TGA)...)*(?:TAA|TAG|TGA)))', s)
    # max() raises ValueError on an empty list, so handle "no ORF" explicitly.
    return max(orfs, key=len) if orfs else ''
#Counting DNA motif occurrences
def count_motif_occurrences(sequence, motif, half_motif):
    """Count occurrences of *motif* in *sequence*, plus half credits.

    The sequence is split on *motif*; each split point counts as one
    occurrence. Every resulting fragment that begins with *half_motif* adds
    0.5 (presumably to credit a truncated copy of the motif). Matching is
    case-sensitive.

    :param sequence: sequence to scan.
    :param motif: full motif; must be non-empty (``str.split`` raises
        ``ValueError`` on an empty separator).
    :param half_motif: prefix that earns half credit.
    :return: the count, in steps of 0.5.
    """
    fragments = sequence.split(motif)
    full_hits = len(fragments) - 1
    half_hits = sum(0.5 for fragment in fragments if fragment.startswith(half_motif))
    return full_hits + half_hits


contig = "CCCCAAAACCCCAAAACCCCAAAACCCCTAcGAaTCCCcTCATAATTGAAAGACTTAAACTTTAAAACCCTAGAAT"
splitbase = "CCCCAAAA"
halfBase = "CCCC"
splittedContig = contig.split(splitbase)
cnt = len(splittedContig) - 1
# Call form of print works on both Python 2 and Python 3 (the original print
# statement is a syntax error on Python 3).
print(count_motif_occurrences(contig, splitbase, halfBase))
No comments:
Post a Comment