patternpythonMinor
Apriori algorithm in Python 2
Viewed 0 times
algorithmaprioripython
Problem
This takes in a dataset, the minimum support and the minimum confidence values as its options, and returns the association rules.
I'm looking for pointers towards better optimization, documentation and code quality.
```
"""
Description : A Python implementation of the Apriori Algorithm
Usage:
$python apriori.py -f DATASET.csv -s minSupport -c minConfidence
$python apriori.py -f DATASET.csv -s 0.15 -c 0.6
"""
import sys
from itertools import chain, combinations
from collections import defaultdict
from optparse import OptionParser
def subsets(arr):
"""
Returns non empty subsets of arr
enumerate(arr) , "
combinations(arr, i) = minSupport:
_itemSet.add(item)
return _itemSet
def joinSet(itemSet, length):
"""Join a set with itself and returns the n-element itemsets"""
return set([i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length])
def getItemSetTransactionList(data_iterator):
"""
Takes data from dataFromFile() and returns list of items and a list of transactions
and generate two seperate sets of items and transactions.
The item list would be:
([frozenset(['apple']), frozenset(['beer']), frozenset(['chicken']), etc
The transaction list would be:
frozenset(['beer', 'rice', 'apple', 'chicken']), frozenset(['beer', 'rice', 'apple']), etc
"""
transactionList = list()
itemSet = set()
for record in data_iterator:
transaction = frozenset(record)
transactionList.append(transaction)
for item in transaction:
itemSet.add(frozenset([item])) # Generate 1-itemSets
return itemSet, transactionList
def runApriori(data_iter, minSupport, minConfidence):
"""
run the apriori algorithm. data_iter is a record iterator
Return both:
- items (tuple, support)
- rules ((pretuple, posttuple), confidence)
"""
itemSet, transactionList = getItemSetTransac
I'm looking for pointers towards better optimization, documentation and code quality.
```
"""
Description : A Python implementation of the Apriori Algorithm
Usage:
$python apriori.py -f DATASET.csv -s minSupport -c minConfidence
$python apriori.py -f DATASET.csv -s 0.15 -c 0.6
"""
import sys
from itertools import chain, combinations
from collections import defaultdict
from optparse import OptionParser
def subsets(arr):
"""
Returns non empty subsets of arr
enumerate(arr) , "
combinations(arr, i) = minSupport:
_itemSet.add(item)
return _itemSet
def joinSet(itemSet, length):
"""Join a set with itself and returns the n-element itemsets"""
return set([i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length])
def getItemSetTransactionList(data_iterator):
"""
Takes data from dataFromFile() and returns list of items and a list of transactions
and generate two seperate sets of items and transactions.
The item list would be:
([frozenset(['apple']), frozenset(['beer']), frozenset(['chicken']), etc
The transaction list would be:
frozenset(['beer', 'rice', 'apple', 'chicken']), frozenset(['beer', 'rice', 'apple']), etc
"""
transactionList = list()
itemSet = set()
for record in data_iterator:
transaction = frozenset(record)
transactionList.append(transaction)
for item in transaction:
itemSet.add(frozenset([item])) # Generate 1-itemSets
return itemSet, transactionList
def runApriori(data_iter, minSupport, minConfidence):
"""
run the apriori algorithm. data_iter is a record iterator
Return both:
- items (tuple, support)
- rules ((pretuple, posttuple), confidence)
"""
itemSet, transactionList = getItemSetTransac
Solution
My biggest piece of advice would be to replace
Could be replaced with
This should be a pretty big speed increase.
Also,
freqSet = defaultdict(int) with a Counter. Counters are a datatype designed to do exactly what you are doing with defaultdicts, and they have some specialized methods. for item in itemSet:
for transaction in transactionList:
if item.issubset(transaction):
freqSet[item] += 1Could be replaced with
freqSet.update(item for item in itemSet for transaction in TransactionList if item.issubset(transaction))This should be a pretty big speed increase.
Also,
set([i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length]) could be written using a set comprehension, which would lower memory usage, and increase speed.Code Snippets
for item in itemSet:
for transaction in transactionList:
if item.issubset(transaction):
freqSet[item] += 1freqSet.update(item for item in itemSet for transaction in TransactionList if item.issubset(transaction))Context
StackExchange Code Review Q#136169, answer score: 5
Revisions (0)
No revisions yet.