patternpythonMinor
Sparse Bitarray Class in Python (or rather frequently having contiguous elements with the same value)
Viewed 0 times
sametheclassbitarrayelementswithhavingvalueratherfrequently
Problem
Sparse is probably the wrong word - this is for encoding arrays of booleans where contiguous values tend to be the same. It'd be great to know a proper name for this data structure so I could read about it or use an existing implementation. I already realize I probably ought not to use doctest, and would welcome feedback on this as well. github
```
"""
Sparse (really Frequently Contiguous) Binary Array
TODO:
bitwise and would be really useful
generalize to more than two possible values
binary search to find overlapping segments
"""
class SBA(object):
"""Sparse BitArray, in which data is represented by ranges
>>> s = SBA(20); s
"""
def __init__(self, length):
self.length = length
self.set_ranges = []
def __len__(self):
return self.length
def __repr__(self):
s = ''
elif self.set_ranges == [(0, self.length)]:
s += 'all bits set>'
else:
s += str(self.set_ranges)+'>'
return s
def _find_overlapping_ranges(self, start, end):
"""Returns existing ranges that overlap or touch a new one
Returns a tuple of
* ranges entirely contained by the new start and end
* ranges over or at the edge of the new start and end
* the range, if any, that entirely contains the new start and end
"""
#TODO use binary search instead here
edge_overlapping = []
contained_by_new = []
contains_new = None
for r_start, r_end in self.set_ranges:
if r_end end:
pass
elif r_start = end:
contains_new = (r_start, r_end)
break
elif r_start >= start and r_end >> s = SBA(20); s[2:6] = True; s
>>> s[2:6]
>>> s[4:10]
>>> s[9:14] = True; s[17:19] = True; s
>>> s[2:18]
>>> s[1], s[10]
(False, True)
"""
if isinstance(key
```
"""
Sparse (really Frequently Contiguous) Binary Array
TODO:
bitwise and would be really useful
generalize to more than two possible values
binary search to find overlapping segments
"""
class SBA(object):
"""Sparse BitArray, in which data is represented by ranges
>>> s = SBA(20); s
"""
def __init__(self, length):
self.length = length
self.set_ranges = []
def __len__(self):
return self.length
def __repr__(self):
s = ''
elif self.set_ranges == [(0, self.length)]:
s += 'all bits set>'
else:
s += str(self.set_ranges)+'>'
return s
def _find_overlapping_ranges(self, start, end):
"""Returns existing ranges that overlap or touch a new one
Returns a tuple of
* ranges entirely contained by the new start and end
* ranges over or at the edge of the new start and end
* the range, if any, that entirely contains the new start and end
"""
#TODO use binary search instead here
edge_overlapping = []
contained_by_new = []
contains_new = None
for r_start, r_end in self.set_ranges:
if r_end end:
pass
elif r_start = end:
contains_new = (r_start, r_end)
break
elif r_start >= start and r_end >> s = SBA(20); s[2:6] = True; s
>>> s[2:6]
>>> s[4:10]
>>> s[9:14] = True; s[17:19] = True; s
>>> s[2:18]
>>> s[1], s[10]
(False, True)
"""
if isinstance(key
Solution
- Your
__repr__function tells the outside world about the implementation of your class. The fact that you are using a particular internal representation of a string shouldn't be exposed even here. I suggest this function should return something of the formSparseBitArray('1111')instead.
_find_overlapping_rangesdoesn't seem very useful. Every time it is used, there some tricky to follow logic regarding it afterwards.
- The logic for decoding the slice is duplicated in
__getitem__and__setitem__, refactor it into a function
__getitem__doesn't raise IndexError for out of bounds indexes. This would make it iterable.
- Your internal data structure isn't actually the best choice. You store data like [(4, 7), (10, 25)]. Each tuple indicates a range of 1 values. A better approach would actually be to hold [4, 8, 10, 25], where each value indicates a point at which the bitstring changes from 1 to 0 or vice versa.
My reworking of your code:
import bisect
import sys
class SBA(object):
"""Sparse BitArray, in which data is represented by ranges
>>> s = SBA(length = 20); s
SparseBitArray('00000000000000000000')
"""
def __init__(self, length):
self.length = length
self.set_ranges = []
self.changes = []
def __len__(self):
return self.length
def __repr__(self):
text = ''.join(str(int(x)) for x in self)
return "SparseBitArray('%s')" % text
def _decode_slice(self, key):
start, end = key.start, key.stop
if start is None: start = 0
if end is None: end = len(self)
if key.step not in [None, 1]: raise ValueError("Custom steps not allowed: "+repr(key))
return start, end
def __getitem__(self, key):
"""Get a slice or the value of an entry
>>> s = SBA(20); s[2:6] = True; s
SparseBitArray('00111100000000000000')
>>> s[2:6]
SparseBitArray('1111')
>>> s[4:10]
SparseBitArray('110000')
>>> s[9:14] = True; s[17:19] = True; s
SparseBitArray('00111100011111000110')
>>> s[2:18]
SparseBitArray('1111000111110001')
>>> s[1], s[10]
(False, True)
"""
if isinstance(key, slice):
start, end = self._decode_slice(key)
start_index, end_index = self._indexes(start, end)
result = SBA(end - start)
if self[start]:
result.changes.append(0)
result.changes.extend(
change - start for change in self.changes[start_index:end_index] )
return result
else:
if key >= len(self):
raise IndexError(key)
return bool(bisect.bisect_right(self.changes, key) % 2)
def _indexes(self, start, end):
start_index = bisect.bisect_right(self.changes, start)
end_index = bisect.bisect_right(self.changes, end)
return start_index, end_index
def __setitem__(self, key, value):
"""Sets item or slice to True or False
>>> s = SBA(20); s[2:6] = True; s
SparseBitArray('00111100000000000000')
>>> s[4:10] = True; s
SparseBitArray('00111111110000000000')
>>> s[3:11] = False; s
SparseBitArray('00100000000000000000')
>>> s[4:14] = True; s
SparseBitArray('00101111111111000000')
>>> s[8:11] = False; s
SparseBitArray('00101111000111000000')
>>> s[5:7] = True; s
SparseBitArray('00101111000111000000')
>>> s[15:18] = False; s
SparseBitArray('00101111000111000000')
>>> s[14:16] = False; s
SparseBitArray('00101111000111000000')
>>> s[14:16] = True; s
SparseBitArray('00101111000111110000')
>>> s[4:] = True; s
SparseBitArray('00101111111111111111')
>>> s[:10] = False; s
SparseBitArray('00000000001111111111')
>>> s[:] = False; s
SparseBitArray('00000000000000000000')
>>> s[:] = True; s
SparseBitArray('11111111111111111111')
"""
if isinstance(key, slice):
start, end = self._decode_slice(key)
start_index, end_index = self._indexes(start, end)
new_changes = self.changes[:start_index]
if bool(len(new_changes) % 2) != value:
new_changes.append(start)
if bool(end_index % 2) != value:
new_changes.append(end)
new_changes.extend( self.changes[end_index:] )
self.changes = new_changes
else:
raise ValueError("Single element assignment not allowed")
if __name__ == '__main__':
import doctest
print doctest.testmod()Code Snippets
import bisect
import sys
class SBA(object):
"""Sparse BitArray, in which data is represented by ranges
>>> s = SBA(length = 20); s
SparseBitArray('00000000000000000000')
"""
def __init__(self, length):
self.length = length
self.set_ranges = []
self.changes = []
def __len__(self):
return self.length
def __repr__(self):
text = ''.join(str(int(x)) for x in self)
return "SparseBitArray('%s')" % text
def _decode_slice(self, key):
start, end = key.start, key.stop
if start is None: start = 0
if end is None: end = len(self)
if key.step not in [None, 1]: raise ValueError("Custom steps not allowed: "+repr(key))
return start, end
def __getitem__(self, key):
"""Get a slice or the value of an entry
>>> s = SBA(20); s[2:6] = True; s
SparseBitArray('00111100000000000000')
>>> s[2:6]
SparseBitArray('1111')
>>> s[4:10]
SparseBitArray('110000')
>>> s[9:14] = True; s[17:19] = True; s
SparseBitArray('00111100011111000110')
>>> s[2:18]
SparseBitArray('1111000111110001')
>>> s[1], s[10]
(False, True)
"""
if isinstance(key, slice):
start, end = self._decode_slice(key)
start_index, end_index = self._indexes(start, end)
result = SBA(end - start)
if self[start]:
result.changes.append(0)
result.changes.extend(
change - start for change in self.changes[start_index:end_index] )
return result
else:
if key >= len(self):
raise IndexError(key)
return bool(bisect.bisect_right(self.changes, key) % 2)
def _indexes(self, start, end):
start_index = bisect.bisect_right(self.changes, start)
end_index = bisect.bisect_right(self.changes, end)
return start_index, end_index
def __setitem__(self, key, value):
"""Sets item or slice to True or False
>>> s = SBA(20); s[2:6] = True; s
SparseBitArray('00111100000000000000')
>>> s[4:10] = True; s
SparseBitArray('00111111110000000000')
>>> s[3:11] = False; s
SparseBitArray('00100000000000000000')
>>> s[4:14] = True; s
SparseBitArray('00101111111111000000')
>>> s[8:11] = False; s
SparseBitArray('00101111000111000000')
>>> s[5:7] = True; s
SparseBitArray('00101111000111000000')
>>> s[15:18] = False; s
SparseBitArray('00101111000111000000')
>>> s[14:16] = False; s
SparseBitArray('00101111000111000000')
>>> s[14:16] = True; s
SparseBitArray('00101111000111110000')
>>> s[4:] = True; s
SparseBitArray('00101111111111111111')
>>> s[:10] = False; s
SparseBitArray('00000000001111111111')
>>> s[:] = False; s
SparseBitArray('00Context
StackExchange Code Review Q#17753, answer score: 3
Revisions (0)
No revisions yet.