HiveBrain v1.2.0
Get Started
← Back to all entries
patternpythonMinor

Find allele frequencies at each site for each iteration for each population from FASTA file

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
frequenciesfastafileeachiterationallelesiteforfindpopulation

Problem

The script takes a FASTA format file in input and outputs the frequencies of each amino acid (A, C, T and G) at each site per population per iteration. The name of each sequence looks like >it0pop1ind0locus0. In this example, this indicates that the sequence that follows is the DNA sequence for the zeroth iteration (it), the first population (pop), the zeroth individual (ind) at the zeroth locus (locus).

My goal (just to restate it) is to get the frequencies of each amino acid (A, C, T and G) at each site per population per iteration.

```
from __future__ import division
import os
from Bio import SeqIO
import itertools
import re
import sys
os.chdir("/path/to/directory/")

#### ReFormat the data ####

maxit = 1
maxpop = 1
maxind = 1
maxlocus = 1
for pos, i in enumerate(SeqIO.parse("0.0.1.fasta", "fasta")):
if pos == 0:
maxsite = len(i.seq)
v1 = re.findall(r'\d+', i.id)
v = []
for val in v1:
v.append(int(val))
maxit = max(v[0]+1, maxit)
maxpop = max(v[1]+1, maxpop)
maxind = max(v[2]+1, maxind)
maxlocus = max(v[3]+1, maxlocus)

all = [[[[ "x" * maxsite for locus in xrange(maxlocus)] for ind in xrange(maxind)] for pop in xrange(maxpop)] for it in xrange(maxit)]

for i in SeqIO.parse("0.0.1.fasta", "fasta"):
v1 = re.findall(r'\d+', i.id)
v = []
for val in v1:
v.append(int(val))
it = v[0]
pop = v[1]
ind = v[2]
locus = v[3]
all[it][pop][ind][locus] = str(i.seq)

#### Calculate the frequencies ####

pA = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pC = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pG = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]
pT = [[[2 for it in xrange(maxsite)] for pop in xrange(maxpop)] for site in xrange(maxit)]

for it in xrange(maxit):
for pop in xrange(maxpop):
for locus in xrange(maxlocus):

Solution

let me choose

First things first; your code is chdiring to /path/to/directory/ and opening 0.0.1.fasta. This is woefully inflexible.

The simplest fix is to accept an argument on the command line, as sys.argv[1]. (Don't use chdir.)

numpy

What you're doing looks a lil' bit crazy. To deal with highly-dimensional data, Numpy is often a good candidate.

A first-pass would look something like

from __future__ import division

import os
from Bio import SeqIO
import itertools
import re
import sys
import numpy

#### ReFormat the data ####

filename = sys.argv[1]

maxit = 1
maxpop = 1
maxind = 1
maxlocus = 1
for pos, i in enumerate(SeqIO.parse(filename, "fasta")):
    print(i.seq)

    if pos == 0:
        maxsite = len(i.seq)

    it, pop, ind, locus = [int(d) for d in re.findall(r'\d+', i.id)]

    maxit = max(it+1, maxit)
    maxpop = max(pop+1, maxpop)
    maxind = max(ind+1, maxind)
    maxlocus = max(locus+1, maxlocus)

all = numpy.full((maxit, maxpop, maxind, maxlocus, maxsite), '\0', dtype="S")

for i in SeqIO.parse(filename, "fasta"):
    it, pop, ind, locus = [int(d) for d in re.findall(r'\d+', i.id)]
    all[it, pop, ind, locus] = i.seq

print(all)
print("---")
#### Calculate the frequencies ####

pA = numpy.full((maxit, maxpop, maxsite), 2)
pC = numpy.full((maxit, maxpop, maxsite), 2)
pG = numpy.full((maxit, maxpop, maxsite), 2)
pT = numpy.full((maxit, maxpop, maxsite), 2)

for it in xrange(maxit):
    for pop in xrange(maxpop):
        for locus in xrange(maxlocus):
            for site in xrange(maxsite):
                x = []
                for ind in xrange(maxind):
                    x.append(all[it, pop, ind, locus, site])
                pA[it, pop, site] = x.count("A") / maxind
                pT[it, pop, site] = x.count("T") / maxind
                pC[it, pop, site] = x.count("C") / maxind
                pG[it, pop, site] = x.count("G") / maxind

print(pA)
print(pC)
print(pG)
print(pT)


Then one can use Numpy tools like fancy slicing

from __future__ import division

import os
import re
import sys

import numpy

from Bio import SeqIO

def parse_id(string_id):
    return tuple(map(int, re.findall(r'\d+', string_id)))

def main():
    filename = sys.argv[1]

    data = list(SeqIO.parse(filename, "fasta"))
    header_keys = numpy.array([parse_id(datum.id) for datum in data])

    maxit, maxpop, maxind, maxlocus = header_keys.max(axis=0) + 1
    maxsite = len(data[0].seq)

    all = numpy.full((maxit, maxpop, maxind, maxlocus, maxsite), '\0', dtype="S")

    for header_key, sequence in zip(header_keys, data):
        all[tuple(header_key)] = sequence

    #### Calculate the frequencies ####
    pA = numpy.full((maxit, maxpop, maxsite), 2)
    pC = numpy.full((maxit, maxpop, maxsite), 2)
    pG = numpy.full((maxit, maxpop, maxsite), 2)
    pT = numpy.full((maxit, maxpop, maxsite), 2)

    for locus in xrange(maxlocus):
        for site in xrange(maxsite):
            x = all[..., locus, :]
            pA[...] = (x == "A").mean(axis=-2)
            pT[...] = (x == "T").mean(axis=-2)
            pC[...] = (x == "C").mean(axis=-2)
            pG[...] = (x == "G").mean(axis=-2)

    print(pA)
    print(pC)
    print(pG)
    print(pT)

main()


Note that

for site in xrange(maxsite):


is pointless as you don't use site and the effect is idempotent. Note then that

for locus in xrange(maxlocus):
    x = all[..., locus, :]
    pA[...] = (x == "A").mean(axis=-2)
    pT[...] = (x == "T").mean(axis=-2)
    pC[...] = (x == "C").mean(axis=-2)
    pG[...] = (x == "G").mean(axis=-2)


is just

x = all[..., -1, :]
pA[...] = (x == "A").mean(axis=-2)
pT[...] = (x == "T").mean(axis=-2)
pC[...] = (x == "C").mean(axis=-2)
pG[...] = (x == "G").mean(axis=-2)


Then one can skip the pre-initialization (pA = numpy.full(...)) and just do

x = all[..., -1, :]
pA = (x == "A").mean(axis=-2)
pT = (x == "T").mean(axis=-2)
pC = (x == "C").mean(axis=-2)
pG = (x == "G").mean(axis=-2)


This allows removing even more of the code.

intermission

Don't use all as a variable name; it clashes with the built-in (same for id).

finishing up

I feel this is a bit awkward still, but it's nicer than before.

```
from __future__ import division

import os
import re
import sys

import numpy

from Bio import SeqIO

def parse_id(string_id):
return tuple(map(int, re.findall(r'\d+', string_id)))[:3]

def main():
filename = sys.argv[1]

data = list(SeqIO.parse(filename, "fasta"))
header_keys = numpy.array([parse_id(datum.id) for datum in data])

maxit, maxpop, maxind = header_keys.max(axis=0) + 1
maxsite = len(data[0].seq)

sequences = numpy.full((maxit, maxpop, maxind, maxsite), '\0', dtype="S")
for header_key, sequence in zip(header_keys, data):
sequences[tuple(header_key)] = sequence

# Calculate frequencies
pA = (sequences == "A").mean(axis=-2)
pT = (sequences == "T").mean(axis=-2)
pC = (sequences == "C").mean(axis=-2)

Code Snippets

from __future__ import division

import os
from Bio import SeqIO
import itertools
import re
import sys
import numpy

#### ReFormat the data ####

filename = sys.argv[1]

maxit = 1
maxpop = 1
maxind = 1
maxlocus = 1
for pos, i in enumerate(SeqIO.parse(filename, "fasta")):
    print(i.seq)

    if pos == 0:
        maxsite = len(i.seq)

    it, pop, ind, locus = [int(d) for d in re.findall(r'\d+', i.id)]

    maxit = max(it+1, maxit)
    maxpop = max(pop+1, maxpop)
    maxind = max(ind+1, maxind)
    maxlocus = max(locus+1, maxlocus)

all = numpy.full((maxit, maxpop, maxind, maxlocus, maxsite), '\0', dtype="S")

for i in SeqIO.parse(filename, "fasta"):
    it, pop, ind, locus = [int(d) for d in re.findall(r'\d+', i.id)]
    all[it, pop, ind, locus] = i.seq

print(all)
print("---")
#### Calculate the frequencies ####

pA = numpy.full((maxit, maxpop, maxsite), 2)
pC = numpy.full((maxit, maxpop, maxsite), 2)
pG = numpy.full((maxit, maxpop, maxsite), 2)
pT = numpy.full((maxit, maxpop, maxsite), 2)

for it in xrange(maxit):
    for pop in xrange(maxpop):
        for locus in xrange(maxlocus):
            for site in xrange(maxsite):
                x = []
                for ind in xrange(maxind):
                    x.append(all[it, pop, ind, locus, site])
                pA[it, pop, site] = x.count("A") / maxind
                pT[it, pop, site] = x.count("T") / maxind
                pC[it, pop, site] = x.count("C") / maxind
                pG[it, pop, site] = x.count("G") / maxind

print(pA)
print(pC)
print(pG)
print(pT)
from __future__ import division

import os
import re
import sys

import numpy

from Bio import SeqIO


def parse_id(string_id):
    return tuple(map(int, re.findall(r'\d+', string_id)))

def main():
    filename = sys.argv[1]

    data = list(SeqIO.parse(filename, "fasta"))
    header_keys = numpy.array([parse_id(datum.id) for datum in data])

    maxit, maxpop, maxind, maxlocus = header_keys.max(axis=0) + 1
    maxsite = len(data[0].seq)

    all = numpy.full((maxit, maxpop, maxind, maxlocus, maxsite), '\0', dtype="S")

    for header_key, sequence in zip(header_keys, data):
        all[tuple(header_key)] = sequence


    #### Calculate the frequencies ####
    pA = numpy.full((maxit, maxpop, maxsite), 2)
    pC = numpy.full((maxit, maxpop, maxsite), 2)
    pG = numpy.full((maxit, maxpop, maxsite), 2)
    pT = numpy.full((maxit, maxpop, maxsite), 2)

    for locus in xrange(maxlocus):
        for site in xrange(maxsite):
            x = all[..., locus, :]
            pA[...] = (x == "A").mean(axis=-2)
            pT[...] = (x == "T").mean(axis=-2)
            pC[...] = (x == "C").mean(axis=-2)
            pG[...] = (x == "G").mean(axis=-2)

    print(pA)
    print(pC)
    print(pG)
    print(pT)

main()
for site in xrange(maxsite):
for locus in xrange(maxlocus):
    x = all[..., locus, :]
    pA[...] = (x == "A").mean(axis=-2)
    pT[...] = (x == "T").mean(axis=-2)
    pC[...] = (x == "C").mean(axis=-2)
    pG[...] = (x == "G").mean(axis=-2)
x = all[..., -1, :]
pA[...] = (x == "A").mean(axis=-2)
pT[...] = (x == "T").mean(axis=-2)
pC[...] = (x == "C").mean(axis=-2)
pG[...] = (x == "G").mean(axis=-2)

Context

StackExchange Code Review Q#95261, answer score: 5

Revisions (0)

No revisions yet.