HiveBrain v1.2.0
Get Started
← Back to all entries
patternpythonMinor

Fast row-wise, parallel calculations on large, sparse matrix

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
fastmatrixwisecalculationslargeparallelrowsparse

Problem

I have a big (50k*150k) matrix with co-occurrences of nouns (rows) and adjectives (columns). As most nouns don't co-occurr with most adjectives, this matrix is very sparse, >99.9%, so I'm using the CSR format. For every pair of nouns I need to calculate the Jason-Shannon Divergence to asses how similar the co-occurrence distributions with all adjectives are between each pair. I have sixteen cores I can use, so I run the calculations in parallel, such that each core processes every sixteenth noun.

It's not going very fast and I'm wondering whether there's a faster way to do this.

```
from __future__ import division
from multiprocessing import Pool
from scipy.stats import entropy
import codecs
import os
import scipy
import scipy.sparse

# --- data & parameters--- #
frequenciesAdjectives = codecs.open('frequencies.txt', 'r', 'utf-8')
results = codecs.open('results.txt', 'w', 'utf-8')
temporaryFilesPath = "/temp/"
cores = 16

# --- functions --- #

# calculate jensen shannon divergence
def JSD(p, q):
p = p
q = q
m = 0.5 * (p+q)
jsd = 0.5 * (entropy(p, m) + entropy(q, m))
return jsd

# calculate JSD for every i-th noun with every other noun and write to temp file, where i is the number of cores used, and return filepath
def getJSDs(n):
# open temporary file
filename = "temp"+str(n).zfill(2)
fullpath = temporaryFilesPath+filename
temp = codecs.open(fullpath, "w", "utf-8")

# shortcut write function
tempwrite = temp.write

# calculate JSD for each noun pair and write to temporary file
for index, noun1 in enumerate(nouns[n::cores]):
index = index*cores+n
first = sparseMatrix.getrow(index).toarray()[0]
tempwrite("here")
for index2, noun2 in enumerate(nouns[index:]):
index2 += index
second = sparseMatrix.getrow(index2).toarray()[0]
divergence = JSD(first, second)
tempwrite(u"{noun1}\t{noun2}\t{divergence}\n".format(**locals()))

temp.close(

Solution

I rewrote the entire thing, now it's two orders of magnitude (500x or so!) faster.

Key changes include:

  • Using a list of tuples (noun, adjectiveDictionary) instead of the sparse matrix and dropping the 0 elements completely



  • Using math.log instead of scipy.log (huge difference!)



  • Switching from Python 2 to Python 3



  • Not dividing the tasks up manually, instead using a generator and the queue that comes with imap_unordered, for which it was crucial to find the right chunksize.



#!/usr/bin/env/python3

from multiprocessing import Pool
from collections import Counter
from math import log2
# from numba import jit

# --- data & parameters--- #
frequenciesAdjectives = open('/home/christian/results/gender/frequenciesAdjectivesGivenNouns_UK.txt', 'r')
results = open('/home/christian/results/gender/JensenShannonDivergences_ukWaC.txt', 'w')
cores = 16

# --- functions --- #
# calculates Jason-Shannon Divergence from tuple of two nouns and their associated adjective probabilities in dictionaries p and q
def JSD(nounTuple):
    noun1, noun2, p, q = nounTuple
    jsd = 0.0
    m = p + q
    for key in m:
        m_key = 0.5 * m[key]
        if key in p:
            p_key = p[key]
            jsd += 0.5 * p_key * log2(p_key/m_key)
        if key in q:
            q_key = q[key]
            jsd += 0.5 * q_key * log2(q_key/m_key)
    return noun1, noun2, jsd

def jobGenerator(tuples):
    for index, (noun, adjectives) in enumerate(tuples):
        for noun2, adjectives2 in tuples[index:]:
            yield noun, noun2, adjectives, adjectives2

# --- processing --- #
# ignore header
frequenciesAdjectives.readline()

# make list of tuples of nouns and dictionaries containing their preceding adjective frequencies
nounAdjectives = []
for line in frequenciesAdjectives:
    adjectives = Counter()
    line = line.strip().lower().split("\t")
    noun = line[0]
    adjectiveList = [pair.split(" ") for pair in line[2:]]
    frequencySum = sum(int(frequency) for _, frequency in adjectiveList)
    for adjective, frequency in adjectiveList:
        probability = int(frequency)/frequencySum
        adjectives[adjective] = probability
    nounAdjectives.append((noun, adjectives))

# make generator of (noun, noun2, adjectives, adjectives2)-tuples
jobs = jobGenerator(nounAdjectives)

# shortcut results.write and write header
resultswrite = results.write
resultswrite(u"noun1\tnoun2\tjensenShannonDivergence")

# calculate JSDs in parallel and write to file
pool = Pool(cores)
for noun1, noun2, jsd in pool.imap_unordered(JSD, jobs, chunksize=500000):
    resultswrite(u"\n{noun1}\t{noun2}\t{jsd}".format_map(locals()))
pool.close()

Code Snippets

#!/usr/bin/env/python3

from multiprocessing import Pool
from collections import Counter
from math import log2
# from numba import jit


# --- data & parameters--- #
frequenciesAdjectives = open('/home/christian/results/gender/frequenciesAdjectivesGivenNouns_UK.txt', 'r')
results = open('/home/christian/results/gender/JensenShannonDivergences_ukWaC.txt', 'w')
cores = 16


# --- functions --- #
# calculates Jason-Shannon Divergence from tuple of two nouns and their associated adjective probabilities in dictionaries p and q
def JSD(nounTuple):
    noun1, noun2, p, q = nounTuple
    jsd = 0.0
    m = p + q
    for key in m:
        m_key = 0.5 * m[key]
        if key in p:
            p_key = p[key]
            jsd += 0.5 * p_key * log2(p_key/m_key)
        if key in q:
            q_key = q[key]
            jsd += 0.5 * q_key * log2(q_key/m_key)
    return noun1, noun2, jsd


def jobGenerator(tuples):
    for index, (noun, adjectives) in enumerate(tuples):
        for noun2, adjectives2 in tuples[index:]:
            yield noun, noun2, adjectives, adjectives2


# --- processing --- #
# ignore header
frequenciesAdjectives.readline()

# make list of tuples of nouns and dictionaries containing their preceding adjective frequencies
nounAdjectives = []
for line in frequenciesAdjectives:
    adjectives = Counter()
    line = line.strip().lower().split("\t")
    noun = line[0]
    adjectiveList = [pair.split(" ") for pair in line[2:]]
    frequencySum = sum(int(frequency) for _, frequency in adjectiveList)
    for adjective, frequency in adjectiveList:
        probability = int(frequency)/frequencySum
        adjectives[adjective] = probability
    nounAdjectives.append((noun, adjectives))

# make generator of (noun, noun2, adjectives, adjectives2)-tuples
jobs = jobGenerator(nounAdjectives)

# shortcut results.write and write header
resultswrite = results.write
resultswrite(u"noun1\tnoun2\tjensenShannonDivergence")

# calculate JSDs in parallel and write to file
pool = Pool(cores)
for noun1, noun2, jsd in pool.imap_unordered(JSD, jobs, chunksize=500000):
    resultswrite(u"\n{noun1}\t{noun2}\t{jsd}".format_map(locals()))
pool.close()

Context

StackExchange Code Review Q#94785, answer score: 2

Revisions (0)

No revisions yet.