patternpythonMinor
Find allele frequencies at each site for each iteration for each population from FASTA file
Viewed 0 times
frequenciesfastafileeachiterationallelesiteforfindpopulation
Problem
The script takes a FASTA format file in input and outputs the frequencies of each amino acid (
My goal (just to restate it) is to get the frequencies of each amino acid (
```
from __future__ import division
import os
from Bio import SeqIO
import itertools
import re
import sys
os.chdir("/path/to/directory/")
#### ReFormat the data ####
maxit = 1
maxpop = 1
maxind = 1
maxlocus = 1
for pos, i in enumerate(SeqIO.parse("0.0.1.fasta", "fasta")):
if pos == 0:
maxsite = len(i.seq)
v1 = re.findall(r'\d+', i.id)
v = []
for val in v1:
v.append(int(val))
maxit = max(v[0]+1, maxit)
maxpop = max(v[1]+1, maxpop)
maxind = max(v[2]+1, maxind)
maxlocus = max(v[3]+1, maxlocus)
all = [[[[ "x" * maxsite for locus in xrange(maxlocus)] for ind in xrange(maxind)] for pop in xrange(maxpop)] for it in xrange(maxit)]
for i in SeqIO.parse("0.0.1.fasta", "fasta"):
v1 = re.findall(r'\d+', i.id)
v = []
for val in v1:
v.append(int(val))
it = v[0]
pop = v[1]
ind = v[2]
locus = v[3]
all[it][pop][ind][locus] = str(i.seq)
#### Calculate the frequencies ####
pA = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pC = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pG = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pT = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
for it in xrange(maxit):
for pop in xrange(maxpop):
for locus in xrange(maxlocus):
A, C, T and G) at each site per population per iteration. The name of each sequence looks like >it0pop1ind0locus0. In this example, this indicates that the sequence that follows is the DNA sequence for the zeroth iteration (it), the first population (pop), the zeroth individual (ind) at the zeroth locus (locus).My goal (just to restate it) is to get the frequencies of each amino acid (
A, C, T and G) at each site per population per iteration.```
from __future__ import division
import os
from Bio import SeqIO
import itertools
import re
import sys
os.chdir("/path/to/directory/")
#### ReFormat the data ####
maxit = 1
maxpop = 1
maxind = 1
maxlocus = 1
for pos, i in enumerate(SeqIO.parse("0.0.1.fasta", "fasta")):
if pos == 0:
maxsite = len(i.seq)
v1 = re.findall(r'\d+', i.id)
v = []
for val in v1:
v.append(int(val))
maxit = max(v[0]+1, maxit)
maxpop = max(v[1]+1, maxpop)
maxind = max(v[2]+1, maxind)
maxlocus = max(v[3]+1, maxlocus)
all = [[[[ "x" * maxsite for locus in xrange(maxlocus)] for ind in xrange(maxind)] for pop in xrange(maxpop)] for it in xrange(maxit)]
for i in SeqIO.parse("0.0.1.fasta", "fasta"):
v1 = re.findall(r'\d+', i.id)
v = []
for val in v1:
v.append(int(val))
it = v[0]
pop = v[1]
ind = v[2]
locus = v[3]
all[it][pop][ind][locus] = str(i.seq)
#### Calculate the frequencies ####
pA = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pC = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pG = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pT = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
for it in xrange(maxit):
for pop in xrange(maxpop):
for locus in xrange(maxlocus):
Solution
let me choose
First things first; your code is
The simplest fix is to accept an argument on the command line, as
numpy
What you're doing looks a lil' bit crazy. To deal with highly-dimensional data, Numpy is often a good candidate.
A first-pass would look something like
Then one can use Numpy tools like fancy slicing
Note that
is pointless as you don't use
is just
Then one can skip the pre-initialization (
This allows removing even more of the code.
intermission
Don't use
finishing up
I feel this is a bit awkward still, but it's nicer than before.
```
from __future__ import division
import os
import re
import sys
import numpy
from Bio import SeqIO
def parse_id(string_id):
return tuple(map(int, re.findall(r'\d+', string_id)))[:3]
def main():
filename = sys.argv[1]
data = list(SeqIO.parse(filename, "fasta"))
header_keys = numpy.array([parse_id(datum.id) for datum in data])
maxit, maxpop, maxind = header_keys.max(axis=0) + 1
maxsite = len(data[0].seq)
sequences = numpy.full((maxit, maxpop, maxind, maxsite), '\0', dtype="S")
for header_key, sequence in zip(header_keys, data):
sequences[tuple(header_key)] = sequence
# Calculate frequencies
pA = (sequences == "A").mean(axis=-2)
pT = (sequences == "T").mean(axis=-2)
pC = (sequences == "C").mean(axis=-2)
First things first; your code is
chdiring to /path/to/directory/ and opening 0.0.1.fasta. This is woefully inflexible.The simplest fix is to accept an argument on the command line, as
sys.argv[1]. (Don't use chdir.)numpy
What you're doing looks a lil' bit crazy. To deal with highly-dimensional data, Numpy is often a good candidate.
A first-pass would look something like
from __future__ import division
import os
from Bio import SeqIO
import itertools
import re
import sys
import numpy
#### ReFormat the data ####
filename = sys.argv[1]
maxit = 1
maxpop = 1
maxind = 1
maxlocus = 1
for pos, i in enumerate(SeqIO.parse(filename, "fasta")):
print(i.seq)
if pos == 0:
maxsite = len(i.seq)
it, pop, ind, locus = [int(d) for d in re.findall(r'\d+', i.id)]
maxit = max(it+1, maxit)
maxpop = max(pop+1, maxpop)
maxind = max(ind+1, maxind)
maxlocus = max(locus+1, maxlocus)
all = numpy.full((maxit, maxpop, maxind, maxlocus, maxsite), '\0', dtype="S")
for i in SeqIO.parse(filename, "fasta"):
it, pop, ind, locus = [int(d) for d in re.findall(r'\d+', i.id)]
all[it, pop, ind, locus] = i.seq
print(all)
print("---")
#### Calculate the frequencies ####
pA = numpy.full((maxit, maxpop, maxsite), 2)
pC = numpy.full((maxit, maxpop, maxsite), 2)
pG = numpy.full((maxit, maxpop, maxsite), 2)
pT = numpy.full((maxit, maxpop, maxsite), 2)
for it in xrange(maxit):
for pop in xrange(maxpop):
for locus in xrange(maxlocus):
for site in xrange(maxsite):
x = []
for ind in xrange(maxind):
x.append(all[it, pop, ind, locus, site])
pA[it, pop, site] = x.count("A") / maxind
pT[it, pop, site] = x.count("T") / maxind
pC[it, pop, site] = x.count("C") / maxind
pG[it, pop, site] = x.count("G") / maxind
print(pA)
print(pC)
print(pG)
print(pT)Then one can use Numpy tools like fancy slicing
from __future__ import division
import os
import re
import sys
import numpy
from Bio import SeqIO
def parse_id(string_id):
return tuple(map(int, re.findall(r'\d+', string_id)))
def main():
filename = sys.argv[1]
data = list(SeqIO.parse(filename, "fasta"))
header_keys = numpy.array([parse_id(datum.id) for datum in data])
maxit, maxpop, maxind, maxlocus = header_keys.max(axis=0) + 1
maxsite = len(data[0].seq)
all = numpy.full((maxit, maxpop, maxind, maxlocus, maxsite), '\0', dtype="S")
for header_key, sequence in zip(header_keys, data):
all[tuple(header_key)] = sequence
#### Calculate the frequencies ####
pA = numpy.full((maxit, maxpop, maxsite), 2)
pC = numpy.full((maxit, maxpop, maxsite), 2)
pG = numpy.full((maxit, maxpop, maxsite), 2)
pT = numpy.full((maxit, maxpop, maxsite), 2)
for locus in xrange(maxlocus):
for site in xrange(maxsite):
x = all[..., locus, :]
pA[...] = (x == "A").mean(axis=-2)
pT[...] = (x == "T").mean(axis=-2)
pC[...] = (x == "C").mean(axis=-2)
pG[...] = (x == "G").mean(axis=-2)
print(pA)
print(pC)
print(pG)
print(pT)
main()Note that
for site in xrange(maxsite):is pointless as you don't use
site and the effect is idempotent. Note then thatfor locus in xrange(maxlocus):
x = all[..., locus, :]
pA[...] = (x == "A").mean(axis=-2)
pT[...] = (x == "T").mean(axis=-2)
pC[...] = (x == "C").mean(axis=-2)
pG[...] = (x == "G").mean(axis=-2)is just
x = all[..., -1, :]
pA[...] = (x == "A").mean(axis=-2)
pT[...] = (x == "T").mean(axis=-2)
pC[...] = (x == "C").mean(axis=-2)
pG[...] = (x == "G").mean(axis=-2)Then one can skip the pre-initialization (
pA = numpy.full(...)) and just dox = all[..., -1, :]
pA = (x == "A").mean(axis=-2)
pT = (x == "T").mean(axis=-2)
pC = (x == "C").mean(axis=-2)
pG = (x == "G").mean(axis=-2)This allows removing even more of the code.
intermission
Don't use
all as a variable name; it clashes with the built-in (same for id).finishing up
I feel this is a bit awkward still, but it's nicer than before.
```
from __future__ import division
import os
import re
import sys
import numpy
from Bio import SeqIO
def parse_id(string_id):
return tuple(map(int, re.findall(r'\d+', string_id)))[:3]
def main():
filename = sys.argv[1]
data = list(SeqIO.parse(filename, "fasta"))
header_keys = numpy.array([parse_id(datum.id) for datum in data])
maxit, maxpop, maxind = header_keys.max(axis=0) + 1
maxsite = len(data[0].seq)
sequences = numpy.full((maxit, maxpop, maxind, maxsite), '\0', dtype="S")
for header_key, sequence in zip(header_keys, data):
sequences[tuple(header_key)] = sequence
# Calculate frequencies
pA = (sequences == "A").mean(axis=-2)
pT = (sequences == "T").mean(axis=-2)
pC = (sequences == "C").mean(axis=-2)
Code Snippets
from __future__ import division
import os
from Bio import SeqIO
import itertools
import re
import sys
import numpy
#### ReFormat the data ####
filename = sys.argv[1]
maxit = 1
maxpop = 1
maxind = 1
maxlocus = 1
for pos, i in enumerate(SeqIO.parse(filename, "fasta")):
print(i.seq)
if pos == 0:
maxsite = len(i.seq)
it, pop, ind, locus = [int(d) for d in re.findall(r'\d+', i.id)]
maxit = max(it+1, maxit)
maxpop = max(pop+1, maxpop)
maxind = max(ind+1, maxind)
maxlocus = max(locus+1, maxlocus)
all = numpy.full((maxit, maxpop, maxind, maxlocus, maxsite), '\0', dtype="S")
for i in SeqIO.parse(filename, "fasta"):
it, pop, ind, locus = [int(d) for d in re.findall(r'\d+', i.id)]
all[it, pop, ind, locus] = i.seq
print(all)
print("---")
#### Calculate the frequencies ####
pA = numpy.full((maxit, maxpop, maxsite), 2)
pC = numpy.full((maxit, maxpop, maxsite), 2)
pG = numpy.full((maxit, maxpop, maxsite), 2)
pT = numpy.full((maxit, maxpop, maxsite), 2)
for it in xrange(maxit):
for pop in xrange(maxpop):
for locus in xrange(maxlocus):
for site in xrange(maxsite):
x = []
for ind in xrange(maxind):
x.append(all[it, pop, ind, locus, site])
pA[it, pop, site] = x.count("A") / maxind
pT[it, pop, site] = x.count("T") / maxind
pC[it, pop, site] = x.count("C") / maxind
pG[it, pop, site] = x.count("G") / maxind
print(pA)
print(pC)
print(pG)
print(pT)from __future__ import division
import os
import re
import sys
import numpy
from Bio import SeqIO
def parse_id(string_id):
return tuple(map(int, re.findall(r'\d+', string_id)))
def main():
filename = sys.argv[1]
data = list(SeqIO.parse(filename, "fasta"))
header_keys = numpy.array([parse_id(datum.id) for datum in data])
maxit, maxpop, maxind, maxlocus = header_keys.max(axis=0) + 1
maxsite = len(data[0].seq)
all = numpy.full((maxit, maxpop, maxind, maxlocus, maxsite), '\0', dtype="S")
for header_key, sequence in zip(header_keys, data):
all[tuple(header_key)] = sequence
#### Calculate the frequencies ####
pA = numpy.full((maxit, maxpop, maxsite), 2)
pC = numpy.full((maxit, maxpop, maxsite), 2)
pG = numpy.full((maxit, maxpop, maxsite), 2)
pT = numpy.full((maxit, maxpop, maxsite), 2)
for locus in xrange(maxlocus):
for site in xrange(maxsite):
x = all[..., locus, :]
pA[...] = (x == "A").mean(axis=-2)
pT[...] = (x == "T").mean(axis=-2)
pC[...] = (x == "C").mean(axis=-2)
pG[...] = (x == "G").mean(axis=-2)
print(pA)
print(pC)
print(pG)
print(pT)
main()for site in xrange(maxsite):for locus in xrange(maxlocus):
x = all[..., locus, :]
pA[...] = (x == "A").mean(axis=-2)
pT[...] = (x == "T").mean(axis=-2)
pC[...] = (x == "C").mean(axis=-2)
pG[...] = (x == "G").mean(axis=-2)x = all[..., -1, :]
pA[...] = (x == "A").mean(axis=-2)
pT[...] = (x == "T").mean(axis=-2)
pC[...] = (x == "C").mean(axis=-2)
pG[...] = (x == "G").mean(axis=-2)Context
StackExchange Code Review Q#95261, answer score: 5
Revisions (0)
No revisions yet.