Friday, July 21, 2017

Language: Python (tools for bioinformatics problems)

#File arrangement in the tool

#setup.cfg
[bdist_wheel]
universal=1
#setup.py
from setuptools import setup
from hypothetical import __version__, __author__, __title__, __license__
def readme():
    """Return the contents of ``README.rst`` as a single string.

    The file name is relative, so it is resolved against the current
    working directory.
    """
    with open('README.rst') as readme_file:
        text = readme_file.read()
    return text
# Package metadata and build configuration.  Name, version, author and
# license are taken from the dunder attributes imported from the package,
# and the long description is the text returned by readme().
setup(
    name=__title__,
    version=__version__,
    description='A bioinformatic annotation',
    long_description=readme(),
    #url='https://github.com/.....',
    author=__author__,
    #author_email='gmail.com',
    # NOTE(review): the metadata above is imported from ``hypothetical`` but
    # the package listed here is ``rabifier`` -- confirm the directory name.
    packages=['rabifier'],
    license=__license__,
    zip_safe=False,
    # Fixed: the first keyword was missing its closing quote, which made the
    # whole file a SyntaxError.
    keywords=['hypothetical', 'annotation'],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        'Natural Language :: English',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.4',
    ],
    install_requires=[
        'biopython',
        'numpy',
        'scipy'
    ],
    setup_requires=[
        'numpy'
    ],
    # matplotlib is only needed for the optional 'plotting' extra.
    extras_require={
        'plotting': ['matplotlib']
    },
    include_package_data=True,
    # Command-line entry scripts installed alongside the package.
    scripts=[
        'bin/rabifier',
        'bin/rabifier-mkdb'
    ]
)

#bin (software, software-mkdb)
#!/usr/bin/env python
if __name__ == '__main__':
    # Only when run as a script: import the annotation module and hand
    # control to its main() entry point.
    from hypothetical.annotation import main as run_annotation
    run_annotation()

------------------------------------------------
#!/usr/bin/env python
if __name__ == '__main__':
    # Imports stay inside the guard so that nothing is loaded unless the
    # file is executed as a script.
    import logging
    import hypothetical.core
    # Configure the root logger at INFO level before main() runs.
    logging.basicConfig(level=logging.INFO)
    # Fixed: this call used to be indented deeper than the lines above,
    # which is an IndentationError.
    hypothetical.core.main()
------------------------------------------------
#software (data folder, scripts)
software/data
The data folder holds .fasta, .meme, .hmm and .json files
#MEME is a motif file format; JSON is a text format for exchanging data objects
#.fasta (use as many fasta seq as needed)
>a3___16870
SLLIFLENEVRERLGLDSIKTHKWTILPCLEWVVQDAKD
>b6___145304
PTYTMADQTPDSWEDELSRQTFTPGAASFVPGQA
#.hmm

#.meme (use as many peptide motifs as needed)
MEME version 4
ALPHABET= ACDEFGHIKLMNPQRSTVWY
Background letter frequencies
A 0.077 C 0.019 D 0.063 E 0.064 F 0.044 G 0.070 H 0.016 I 0.056 K 0.067
L 0.080 M 0.020 N 0.044 P 0.028 Q 0.042 R 0.056 S 0.078 T 0.063 V 0.071
W 0.012 Y 0.030
MOTIF KLGCAY
letter-probability matrix: alength= 20 w= 5 nsites= 1152 E= 6.9e-2134
0.000868  0.000000  0.000000  0.000868
0.002604  0.001736  0.001736  0.000868
MOTIF GERSWY
letter-probability matrix: alength= 20 w= 5 nsites= 1242 E= 7.3e-2599
0.025827  0.000821  0.002466  0.001662  0.000036

0.086214  0.001626  0.000050  0.000052  0.09182
# .json  (provide the location info) #an example from other source
e.g.
{
  "rab37": {
    "hs_location": 110.50251713293918,
    "ph_location": 112.86100043738627,
    "ph_scale": 11.00197156371245,
    "hs_scale": 8.85713380184737
  },
  "rab36": {
    "hs_location": 154.27225307156183,
    "ph_location": 153.74148083154392,
    "ph_scale": 13.191095803448913,
    "hs_scale": 10.547615274982139
  },
  "DmRabX5": {
    "hs_location": 140.0,
    "ph_location": 140.02393936245005,
    "ph_scale": 9.3399952365444658,
    "hs_scale": 6.0
  },
  "DmRabX4": {
    "hs_location": 104.09023474783298,
    "ph_location": 92.151361443303443,
    "ph_scale": 13.713705239848309,
    "hs_scale": 6.8325548548732469
  }
}
---------------------------------------------------------
#Python Scripts
__init__.py
config.json
core.py
hypothetical.py
utils.py

#__init__.py
#!/usr/bin/env python
import os
import json
import logging
import tempfile

# Package metadata; setup.py imports the title, version, author and license.
__title__ = 'hypothetical'
__version__ = '1.0.0'
__author__ = 'Seema Patel'
__license__ = 'xxxx'
__copyright__ = 'Copyright 2016 Seema Patel'

# Read config.json from the directory that holds this module, then record
# the system temporary directory under the 'tmp' key.
with open(os.path.join(os.path.dirname(__file__), 'config.json')) as fin:
    config = json.loads(fin.read())
config['tmp'] = tempfile.gettempdir()

# No handlers or levels are configured here (the basicConfig call is
# disabled); this only requests the logger named after the package.
logging.getLogger(__name__)

#core.py
#!/usr/bin/env python
from __future__ import print_function
import os
import json
from collections import defaultdict
import math
import operator
import logging
import argparse

from Bio import SeqIO, AlignIO
import numpy as np
import scipy.stats
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

from . import __version__
from . import config
from .utils import Pathfinder, run_cmd, run_cmd_if_file_missing, merge_files

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Set this logger's own threshold to DEBUG.
logger.setLevel(logging.DEBUG)


class Seed(object):
    
    def __init__(self, source, **kwargs):
        self.source = os.path.abspath(source)
        self.pathfinder = Pathfinder(True)

        # Temporary folders,
        self.path_tmp = kwargs.get('tmp', os.path.join(config['tmp'], 'annotation_tmp'))
        for name in ('build', 'test', 'seed'):
            path = os.path.join(self.path_tmp, name)
            if not os.path.exists(path):
                os.makedirs(path)
        self.path = {
            'non_domain': os.path.join(self.source, 'other', '{}.full'.format(config['seed']['non_domain'])),
            'domain_model_manual_override':
                os.path.join(self.source, 'other', 'domain_subfamilies_logreg_params.dat'),
            'domain': os.path.join(self.source, 'domain', '{}.fasta.full')
        }
        self.output = {
            'domain_db': os.path.join(self.path_tmp, 'seed', config['seed']['domain_db']),
            'domain_hmm': os.path.join(self.path_tmp, 'seed', config['seed']['domain_hmm']),
            'domain_model': os.path.join(self.path_tmp, 'seed', config['seed']['domain_model']),
            'domain_f': os.path.join(self.path_tmp, 'seed', config['seed']['rdomain_f']),
            'non_domain_db': os.path.join(self.path_tmp, 'seed', config['seed']['non_domain_db']),
            'domain_db_reduced': os.path.join(self.path_tmp, 'build', 'domain_db_reduced.fasta'),
            'domain_db_reduced_msa': os.path.join(self.path_tmp, 'build', 'domain_db_reduced.mafft'),
            'domain_f_single_motif': os.path.join(self.path_tmp, 'build', 'motif_{}.meme')
        }
      
        self.cpu = str(kwargs.get('cpu', config['param']['cpu']))        
        self.check()

        # Load subfamily names that will be considered for the seed database construction
        self.domain_subfamilies = config['domain_subfamilies']

    def get_subfamily_path(self, subfamily, extension=None):
        path = os.path.join(self.path_tmp, 'build', subfamily)
        if extension is None:
            return path
        else:
            return '.'.join([path, extension])

    def check(self):

        for tool in ('cd-hit', 'prank', 'hmmbuild', 'hmmpress', 'hmmscan', 'phmmer', 'mafft', 'meme'):
            if not self.pathfinder.exists(tool):
                raise RuntimeError("Dependency {} is missing".format(tool))

    def __call__(self):     

        self.build_subfamilies()
        self.build_subfamily_models()
        self.build_domain_f_models()
        self.generate_non_domain()

    def dump_db_files(self, destination):    

        pass
    def build_subfamilies(self):
        for subfamily in self.domain_subfamilies:
           .......................
#find longest ORF in DNA sequence
#(when ATG is the start codon; TAG, TGA, and TAA are stop codons)
import re

# A start codon, then in-frame codons that are not stop codons, then the
# first in-frame stop codon.  The pattern sits inside a lookahead so that
# ORFs overlapping an earlier match (e.g. in another reading frame) are
# reported too; a plain findall would skip them.
_ORF_RE = re.compile(r'(?=(ATG(?:(?!TAA|TAG|TGA)...)*(?:TAA|TAG|TGA)))')


def longest_orf(s):
    """Return the longest open reading frame in the DNA string *s*.

    An ORF starts at ``ATG`` and runs, codon by codon, to the first in-frame
    ``TAA``, ``TAG`` or ``TGA``.  Matching is case-sensitive and does not
    cross line breaks.

    :param s: DNA sequence to search
    :return: the longest ORF including its start and stop codon, or ``''``
        when *s* contains none; on ties the left-most ORF is returned
    """
    candidates = _ORF_RE.findall(s)
    if not candidates:
        return ''
    return max(candidates, key=len)
 #Counting DNA motif occurrences
def count_motif_occurrences(sequence, motif, half_motif):
    """Count *motif* in *sequence*, scoring partial matches as one half.

    The sequence is split on *motif*, so each split point is one full
    occurrence.  Every leftover piece that begins with *half_motif* adds 0.5.

    :param sequence: text to search
    :param motif: full motif used as the split separator (must be non-empty)
    :param half_motif: prefix that counts as half an occurrence
    :return: number of full occurrences plus 0.5 per matching piece
    """
    pieces = sequence.split(motif)
    full_hits = len(pieces) - 1
    return full_hits + sum(0.5 for piece in pieces if piece.startswith(half_motif))


contig = "CCCCAAAACCCCAAAACCCCAAAACCCCTAcGAaTCCCcTCATAATTGAAAGACTTAAACTTTAAAACCCTAGAAT"
splitbase = "CCCCAAAA"
halfBase = "CCCC"

# Parenthesised call form so the snippet runs on both Python 2 and Python 3.
print(count_motif_occurrences(contig, splitbase, halfBase))

No comments:

Post a Comment

Laboratory tools and reagents (Micro-pipettes)...

Micro-pipettes are essential tools of R & D labs, and an integral part of Good Laboratory Practices (GLPs). Micro-pipetting methods include ...