HiveBrain v1.2.0
Get Started
← Back to all entries
patternpythonMinor

FASTA file processing using Python to invoke external filters

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
fastafilepythonusingexternalfiltersinvokeprocessing

Problem

I am very new to programming and this is my first functional code. It works fine, but I'm sure it could use a lot of optimization. If you see any blunders or are able to help condense the script, that would be fantastic.

```
#!/usr/bin/python

import sys, getopt, subprocess, os, tempfile, shutil, time

file_name = sys.argv[2]
pwd = os.getcwd() + "/"
dirname = pwd + "Secretome_files"
file_location = dirname + '/'

try:
os.makedirs(dirname)
except OSError:
if os.path.exists(dirname):
pass
else:
raise

def singleline():
print "\nMaking fasta single line"
file_in = sys.argv[1]
file_out = open(file_location + file_name + "singleline.fasta", "w")
command = ("fasta_formatter -i " + file_in + " -w 0")
p1 = subprocess.Popen((command), stdout=file_out, shell=True)
p1.wait()
print "Fasta now single line"

def signalp():
singleline()
command = ("signalp -f short -m " + file_location + file_name + "removed_SigPep.fasta " + file_location + file_name + "singleline.fasta > "+ file_location + file_name + "signalpOUT.txt")
print "\nRunning SignalP"
signalpRUN = subprocess.Popen([command], shell=True)
signalpRUN.wait()
print "SignalP Complete"

print "\nCreating SignalP protein list"
command2 = ("fasta_formatter -i " + file_location + file_name + "removed_SigPep.fasta -t")
file_out2 = open(file_location + file_name + "removed_SigPep_tab.fasta.txt", "w")
tab = subprocess.Popen([command2], stdout=file_out2, shell=True)
tab.wait()

command3 = ("cut -f1,1 " + file_location + file_name + "removed_SigPep_tab.fasta.txt")
file_out3 = open(file_location + file_name + "listaftercut.txt", "w")
file_out4 = open(file_location + file_name + "goodlistSigP.txt", "w")
listGood = subprocess.Popen([command3], stdout=file_out3, shell=True)
listGood.wait()
openfile = open(file_location + file_name + "listaftercut.txt", 'r')
for line in

Solution

#!/usr/bin/python

import sys, getopt, subprocess, os, tempfile, shutil, time

file_name = sys.argv[2]
pwd = os.getcwd() + "/"
dirname = pwd + "Secretome_files"
file_location = dirname + '/'


Use the function os.path.join to create paths rather than adding the / yourself.

try:
    os.makedirs(dirname)
except OSError:
    if os.path.exists(dirname):
        pass
    else:
        raise


Instead do something like this:

except OSError as error:
    if error.errno != errno.ENOENT:
        raise


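This way you check the error code, which can already tell you whether it failed because the directory already existed. Note that the code for "already exists" is errno.EEXIST (errno.ENOENT means "no such file or directory"), so the comparison should be against errno.EEXIST, and the errno module has to be imported for this to work.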

def singleline():       
    print "\nMaking fasta single line"
    file_in = sys.argv[1]


I recommend passing command line arguments in as parameters rather than fetching them here.

file_out = open(file_location + file_name + "singleline.fasta", "w")


You should really close this file after you are done with it

command = ("fasta_formatter -i " + file_in + " -w 0")


Those parens do nothing

p1 = subprocess.Popen((command), stdout=file_out, shell=True)


The parens do nothing here.

p1.wait()


You could use the function subprocess.check_call, which will take care of doing the wait. It will also raise an exception (CalledProcessError) if the program exits with a non-zero return code, which might be useful.

print "Fasta now single line"

def signalp():
    singleline()


Stylistically, I'd have the function singleline return the location of its output file, and then use that here.

command = ("signalp -f short -m " + file_location + file_name + "removed_SigPep.fasta " + file_location + file_name + "singleline.fasta > "+ file_location + file_name + "signalpOUT.txt")


Adding strings isn't very efficient. For subprocess, it makes the most sense to create a list of parameters and pass that to the Popen constructor.

print "\nRunning SignalP"   
    signalpRUN = subprocess.Popen([command], shell=True)


Why do you use the > syntax here where you passed a file object earlier? I'd recommend being consistent.

signalpRUN.wait()   
    print "SignalP Complete"

    print "\nCreating SignalP protein list"
    command2 = ("fasta_formatter -i " + file_location + file_name + "removed_SigPep.fasta -t")
    file_out2 = open(file_location + file_name + "removed_SigPep_tab.fasta.txt", "w")
    tab = subprocess.Popen([command2], stdout=file_out2, shell=True)
    tab.wait()


You're doing this same basic thing several times. Write a function that takes the command line and the output file and does the execution.

command3 = ("cut -f1,1 " + file_location + file_name + "removed_SigPep_tab.fasta.txt")
    file_out3 = open(file_location + file_name + "listaftercut.txt", "w")   
    file_out4 = open(file_location + file_name + "goodlistSigP.txt", "w")


Wait to open this until you are using it

listGood = subprocess.Popen([command3], stdout=file_out3, shell=True)
    listGood.wait()
    openfile = open(file_location + file_name + "listaftercut.txt", 'r')


I suggest using the with open() as file: syntax to make sure files get closed.

for line in openfile:
        goodname = line.partition(' ')[0] + '\n'
        file_out4.write(goodname)


The output for this file would be better in its own function.

def sigpFasta():    
    command4 = ("faSomeRecords " + file_location + file_name + "singleline.fasta " + file_location + file_name + "goodlistSigP.txt " + file_location + file_name + "signalP_pass.fasta")
    print "\nRetreving SignalP fasta"   
    fastaRUN = subprocess.Popen([command4], shell=True) 
    fastaRUN.wait()

def tmhmm(): 
    command = ("tmhmm " + file_location + file_name + "removed_SigPep.fasta")
    file_out = open(file_location + file_name + "tmhmmOUT.txt", "w")
    print "\nRunning tmhmm on mature signalp sequences only"
    tmhmmRUN = subprocess.Popen([command], stdout=file_out, shell=True)
    tmhmmRUN.wait()


Another place to use that function. Also, rather than all the print chatter, I suggest having it print the commands it's executing.

print "tmhmm complete"
    print "\nIdentifying sequences without tm regions."
    openfile = open(file_location + file_name + "tmhmmOUT.txt", "r")
    file_out2 = open(file_location + file_name + "tmhmmGoodlist.txt", "a")
    for line in openfile:
        if "\tPredHel=0\t" in line:                 
            goodname = line.partition('\t')[0] + '\n'
            file_out2.write(goodname)


Rather than reopening and appending to the Goodlist a bunch of times, I suggest one function that opens each of the files you pull from and writes them. That way you only have to open the Goodlist once.

```
def targetp():
command = ("targetp -N " + file_location + file_name + "signalP_pass.fasta")
file_out = open(file_location + file_name + "targetpOUT.txt", "w")
print "\nRunning TargetP on SignalP pass seqeunces only"
targetpRUN = subprocess.Popen([command], stdout=file_out, shell=True)
ta

Code Snippets

#!/usr/bin/python

import sys, getopt, subprocess, os, tempfile, shutil, time

file_name = sys.argv[2]
pwd = os.getcwd() + "/"
dirname = pwd + "Secretome_files"
file_location = dirname + '/'
try:
    os.makedirs(dirname)
except OSError:
    if os.path.exists(dirname):
        pass
    else:
        raise
except OSError as error:
    if error.errno != errno.ENOENT:
        raise
def singleline():       
    print "\nMaking fasta single line"
    file_in = sys.argv[1]
file_out = open(file_location + file_name + "singleline.fasta", "w")

Context

StackExchange Code Review Q#19695, answer score: 6

Revisions (0)

No revisions yet.