from time import time


class BinTree:
    def __init__(self, key, left, right):
        """
        Init Tree
        """
        self.key = key
        self.left = left
        self.right = right


class Heap:
    def __init__(self):
        """Init heap."""
        # Index 0 is a placeholder so that the children of i are 2i and 2i+1.
        self.elts = [None]

    def isempty(self):
        """Check whether heap is empty.

        Returns:
            bool: True if heap is empty, False otherwise.
        """
        return len(self.elts) == 1

    def push(self, x):
        """Add an element to the heap.

        Args:
            x (value, elt): pair to enqueue.

        Returns:
            Heap: The updated heap.
        """
        self.elts.append(x)
        i = len(self.elts)-1
        while (i > 1) and x[0] < self.elts[i//2][0]:
            (self.elts[i], self.elts[i//2]) = (self.elts[i//2], self.elts[i])
            i = i // 2
        return self

    def pop(self):
        """Remove and return first element from the heap.

        Returns:
            (num, any): Element from the queue.

        Raises:
            IndexError: If heap is empty.
        """
        e = self.elts[1]
        self.elts[1] = self.elts[len(self.elts)-1]
        self.elts.pop()
        n = len(self.elts)-1
        ok = False
        i = 1
        while (i <= n // 2) and not ok:
            j = 2 * i
            if (j + 1 <= n) and (self.elts[j+1][0] < self.elts[j][0]):
                j = j + 1
            if self.elts[i][0] > self.elts[j][0]:
                (self.elts[i], self.elts[j]) = (self.elts[j], self.elts[i])
                i = j
            else:
                ok = True
        return e


###############################################################################
# Do not change anything above this line, except your login!
# Do not add any import
###############################################################################

## COMPRESSION

def buildfrequencylist(dataIN):
    """
    Builds a tuple list of the character frequencies in the input.

    Args:
        dataIN (str): Text to analyse. None or "" yields an empty list.

    Returns:
        list: (count, character) pairs, ordered by first appearance of the
        character in the input.
    """
    # Check for NoneType and empty input
    if not dataIN:
        return []
    # A dict keeps insertion order, so the result is ordered by first
    # appearance without rescanning the list for every character.
    counts = {}
    for c in dataIN:
        counts[c] = counts.get(c, 0) + 1
    return [(count, c) for (c, count) in counts.items()]


def buildHuffmantree(inputList):
    """
    Processes the frequency list into a Huffman tree according to the
    algorithm.

    Args:
        inputList (list): (count, character) pairs.

    Returns:
        BinTree: Root of the Huffman tree, or None for an empty/None list.
        With a single pair the tree is a lone leaf.
    """
    # Check for NoneType and empty list
    if not inputList:
        return None
    heap = Heap()  # Heap of (weight, BinTree) pairs
    for (val, elt) in inputList:
        heap.push((val, BinTree(elt, None, None)))
    # Merge BinTrees while there are multiple (elts[0] is a placeholder)
    while len(heap.elts) > 2:
        # Pop the two smallest
        (smallerVal, smallerBinTree) = heap.pop()
        (smallVal, smallBinTree) = heap.pop()
        # Push merged BinTree: second-popped goes left, first-popped right
        heap.push((smallVal + smallerVal,
                   BinTree(None, smallBinTree, smallerBinTree)))
    # Exactly one element is left: the fully merged BinTree
    return heap.pop()[1]


def encodedata(huffmanTree, dataIN):
    """
    Encodes the input string to its binary string representation.

    Args:
        huffmanTree (BinTree): Huffman tree holding every character of dataIN.
        dataIN (str): Text to encode.

    Returns:
        str: Concatenation of the '0'/'1' codes of each character ("0" means
        left, "1" means right). "" when dataIN is empty or None.

    Raises:
        Exception: If the tree is empty, or a character of dataIN has no leaf
            in the tree.
    """
    # Check for NoneType
    if not huffmanTree:
        raise Exception("Empty BinTree")
    # Check for NoneType and empty string
    if not dataIN:
        return ""
    # Build the character -> code table once with an explicit stack.
    codes = {}
    stack = [(huffmanTree, "")]
    while stack:
        (node, path) = stack.pop()
        if node.key is not None:
            # A tree reduced to a single leaf would give an empty code, which
            # cannot be written nor decoded: use one '0' bit per character.
            codes.setdefault(node.key, path or "0")
        if node.left:
            stack.append((node.left, path + "0"))
        if node.right:
            stack.append((node.right, path + "1"))
    bits = []
    for c in dataIN:
        if c not in codes:
            raise Exception("Path to '" + str(c) + "' not found")
        bits.append(codes[c])
    return "".join(bits)


def encodetree(huffmanTree):
    """
    Encodes a huffman tree to its binary representation using a preOrder
    traversal:
        * each leaf key is encoded into its binary representation on 8 bits
          preceded by '1'
        * each time we go left we add a '0' to the result

    Args:
        huffmanTree (BinTree): Tree to encode (recursive version).

    Returns:
        str: The '0'/'1' representation, "" for an empty tree.

    Raises:
        ValueError: If a leaf key does not fit on 8 bits.
    """
    # Check for NoneType
    if not huffmanTree:
        return ""
    bits = []

    def prefixEncodeTree(node):
        if node is None:
            return
        if not node.left and not node.right:
            # A childless node without key is written like an internal node
            if node.key is None:
                bits.append("0")
            else:
                bits.append("1" + __key2bin(node.key))
            return
        bits.append("0")
        prefixEncodeTree(node.left)
        prefixEncodeTree(node.right)

    prefixEncodeTree(huffmanTree)
    return "".join(bits)


def it_encodetree(huffmanTree):
    """
    Encodes a huffman tree to its binary representation using a preOrder
    traversal:
        * each leaf key is encoded into its binary representation on 8 bits
          preceded by '1'
        * each time we go left we add a '0' to the result

    Args:
        huffmanTree (BinTree): Tree to encode (iterative version).

    Returns:
        str: The '0'/'1' representation, "" for an empty tree.

    Raises:
        ValueError: If a leaf key does not fit on 8 bits.
    """
    # Check for NoneType
    if not huffmanTree:
        return ""
    bits = []
    stack = [huffmanTree]
    while stack:
        node = stack.pop()
        if node.key is not None:
            bits.append("1" + __key2bin(node.key))
        else:
            bits.append("0")
        # Push right first so that left is visited first (preOrder)
        if node.right:
            stack.append(node.right)
        if node.left:
            stack.append(node.left)
    return "".join(bits)


def tobinary(dataIN):
    """
    Compresses a string containing binary code to its real binary value.

    Args:
        dataIN (str): String made of '0' and '1' characters.

    Returns:
        (str, int): The packed string (one character per group of 8 bits) and
        the number of '0' added on the left of the last group to fill it.
        An empty input gives ("\\x00", 8).
    """
    chunks = [dataIN[k:k + 8] for k in range(0, len(dataIN), 8)]
    last = chunks.pop() if chunks else ""
    align = 8 - len(last)
    packed = [chr(__bin2dec(chunk)) for chunk in chunks]
    # The last group is left-padded; frombinary() skips these `align` bits
    packed.append(chr(__bin2dec("0" * align + last)))
    return "".join(packed), align


def compress(dataIn):
    """
    The main function that makes the whole compression process.

    Args:
        dataIn (str): Text to compress.

    Returns:
        ((str, int), (str, int)): tobinary() result for the encoded data,
        then tobinary() result for the encoded Huffman tree.
    """
    # Nothing to encode: both streams are empty (there is no tree to build)
    if not dataIn:
        return tobinary(""), tobinary("")
    # Build Huffman tree
    frequencies = buildfrequencylist(dataIn)
    tree = buildHuffmantree(frequencies)
    # Encode data & Huffman tree
    encodedData = encodedata(tree, dataIn)
    encodedTree = encodetree(tree)
    return tobinary(encodedData), tobinary(encodedTree)


################################################################################
## DECOMPRESSION

def decodedata(huffmanTree, dataIN):
    """
    Decode a string using the corresponding huffman tree into something more
    readable.

    Args:
        huffmanTree (BinTree): Huffman tree used for the encoding.
        dataIN (str): String made of '0' and '1' characters.

    Returns:
        str: The decoded text, "" when dataIN is empty.

    Raises:
        Exception: If the tree is empty, dataIN holds something else than
            '0'/'1', a path leaves the tree, or dataIN stops in the middle
            of a path.
    """
    # Check for NoneType
    if not huffmanTree:
        raise Exception("Empty Huffman tree")
    # Check for empty string
    if not dataIN:
        return ""
    # Tree reduced to a single leaf: encodedata() writes one '0' per character
    if huffmanTree.key is not None:
        for bit in dataIN:
            if bit == '1':
                raise Exception("Path not found in Huffman tree")
            if bit != '0':
                raise Exception("Input string should only contain '0' and '1' characters")
        return huffmanTree.key * len(dataIN)
    decoded = []
    node = huffmanTree
    for bit in dataIN:
        if bit == '0':
            node = node.left
        elif bit == '1':
            node = node.right
        else:
            raise Exception("Input string should only contain '0' and '1' characters")
        if node is None:
            raise Exception("Path not found in Huffman tree")
        if node.key is not None:
            # Leaf reached: emit its character and restart from the root
            decoded.append(node.key)
            node = huffmanTree
    if node is not huffmanTree:
        raise Exception("Incomplete path to character")
    return "".join(decoded)


def decodetree(dataIN):
    """
    Decodes a huffman tree from its binary representation:
        * a '0' means we add a new internal node and go to its left node
        * a '1' means the next 8 values are the encoded character of the
          current leaf

    Args:
        dataIN (str): String made of '0' and '1' characters.

    Returns:
        BinTree: The decoded tree, None when dataIN is empty.

    Raises:
        Exception: If a leaf is truncated ("Syntax error") or dataIN holds
            something else than '0'/'1'.
    """
    # Check for empty string
    if not dataIN:
        return None
    # First pass: one token per node in preOrder (None = internal node)
    tokens = []
    pos = 0
    length = len(dataIN)
    while pos < length:
        if dataIN[pos] == '0':
            tokens.append(None)
            pos += 1
        elif dataIN[pos] == '1':
            # Check for "1########" (9 characters-long)
            if length - pos < 9:
                raise Exception("Syntax error")
            tokens.append(chr(__bin2dec(dataIN[pos + 1:pos + 9])))
            pos += 9
        else:
            raise Exception("Input string should only contain '0' and '1' characters")
    count = len(tokens)

    def buildTree(i):
        # Returns the node starting at token i and the index of the last
        # token it consumed; a missing node is None (truncated input).
        if i >= count:
            return None, count
        if tokens[i] is not None:
            return BinTree(tokens[i], None, None), i
        node = BinTree(None, None, None)
        (node.left, i) = buildTree(i + 1)
        (node.right, i) = buildTree(i + 1)
        return node, i

    return buildTree(0)[0]


def frombinary(dataIN, align):
    """
    Retrieve a string containing binary code from its real binary value
    (inverse of :func:`toBinary`).

    Args:
        dataIN (str): Packed string, one character per group of 8 bits.
        align (int): Number of padding bits to skip at the start of the last
            group (expected between 0 and 8).

    Returns:
        str: The '0'/'1' string, "" when dataIN is empty.
    """
    # Check for empty input data
    if not dataIN:
        return ""
    groups = [__dec2bin(ord(c)) for c in dataIN[:-1]]
    # Drop the padding that tobinary() added in front of the last group
    groups.append(__dec2bin(ord(dataIN[-1]))[align:])
    return "".join(groups)


def decompress(data, dataAlign, tree, treeAlign):
    """
    The whole decompression process.

    Args:
        data (str): Packed encoded data, as returned by compress().
        dataAlign (int): Padding of the last data character.
        tree (str): Packed encoded Huffman tree, as returned by compress().
        treeAlign (int): Padding of the last tree character.

    Returns:
        str: The original text.
    """
    # Decompress Huffman tree & data
    enTree = frombinary(tree, treeAlign)
    enData = frombinary(data, dataAlign)
    # Both streams empty: this is what compress() produces for an empty text
    if not enTree and not enData:
        return ""
    # Decode Huffman tree & data
    deTree = decodetree(enTree)
    deData = decodedata(deTree, enData)
    # Return decoded data
    return deData


################################################################################
## ADDITIONAL FUNCTIONS

def __dec2bin(x):
    """Return the binary digits of x as a string, left-padded to 8 chars."""
    n = ""
    while x >= 1:
        n = ("0" if x % 2 == 0 else "1") + n
        x //= 2
    # Force output string length to 8 (fills with "0")
    return n.rjust(8, "0")


def __bin2dec(x):
    """Return the value of x, a string (or int) of binary digits.

    Raises:
        Exception: If x is negative or holds a digit greater than 1.
        ValueError: If x is not a string of decimal digits.
    """
    x = int(x)
    if x < 0:
        raise Exception("Invalid binary")
    n = 0
    w = 1
    while x > 0:
        (x, digit) = divmod(x, 10)
        if digit > 1:
            raise Exception("Not binary")
        n += digit * w
        w *= 2
    return n


def __key2bin(key):
    """Return the 8-bit representation of a leaf key.

    Raises:
        ValueError: If the character needs more than 8 bits; writing it
            anyway would shift every following bit of the encoded tree.
    """
    code = ord(key)
    if code > 255:
        raise ValueError("Character '" + key + "' does not fit on 8 bits")
    return __dec2bin(code)


# Demo: compress then decompress a sample text and print the timings.
# The two paragraphs were joined by a backslash line continuation in the
# source, hence no separator between "sem." and "Aenean".
original = (
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque "
    "elementum quam diam, nec pharetra velit gravida ac. Quisque molestie "
    "efficitur nisl, auctor congue urna tempus in. Nullam risus felis, "
    "sollicitudin sit amet magna ultrices, consectetur cursus nisi. Mauris "
    "luctus leo dui, in rutrum mauris sodales non. Morbi laoreet purus et "
    "nulla elementum, et fermentum purus posuere. Etiam id porttitor odio. "
    "Mauris porttitor enim eu justo cursus, ac efficitur lacus pretium. "
    "Nulla eu enim quis metus fermentum suscipit. Etiam vel est in odio "
    "suscipit pretium. Donec gravida libero urna, vitae gravida massa "
    "aliquam fermentum. Nam orci ante, varius non purus eu, convallis "
    "tempus sem."
    "Aenean euismod accumsan nunc, ac tincidunt odio interdum sit amet. "
    "Aliquam sit amet metus sem. Maecenas a vehicula ex, eu congue risus. "
    "Proin laoreet auctor porttitor. Interdum et malesuada fames ac ante "
    "ipsum primis in faucibus. Aenean ut vulputate lacus, id condimentum "
    "magna. Maecenas ultricies nec velit et amet."
)
print(original)
a = time()
compressed = compress(original)
print(time() - a)
b = time()
uncompressed = decompress(compressed[0][0], compressed[0][1],
                          compressed[1][0], compressed[1][1])
print(time() - b)
print(uncompressed)
# (Web-page residue, not part of the program.) We use cookies to provide and improve our services. By using our site, you consent to our Cookies Policy. Accept Learn more