Huffman tree, Huffman encoding/decoding

Huffman tree

Basic introduction to Huffman trees

Illustration of Huffman tree construction steps

Create Huffman tree code implementation

""" Create Huffman tree """
classEleNode:
    """ Node class """
    def __init__(self, value: int):
        self.value = value
        self.left = None # Point to the left child node
        self.right = None # Point to the right child node

    def __str__(self):
        return f"Node [value={self.value}]"

    def pre_order(self):
        """Preorder traversal of a binary tree"""
        if self is None:
            return
        # Output the parent node first
        print(self, end=' => ')
        # If the left subtree is not empty, recurse the left subtree
        if self.left is not None:
            self.left.pre_order()
        # If the right subtree is not empty, recurse the right subtree
        if self.right is not None:
            self.right.pre_order()


classHuffmanTree:
    arr: list = None

    def __init__(self, arr):
        self.arr = arr

    def create_huffman_tree(self) -> EleNode:
        # Sort the arr list in ascending order
        self.arr.sort()
        # Traverse the array and construct each array element into a Node node
        # Add Node nodes to a new list
        node_list = []
        for i in self.arr:
            node_list.append(EleNode(i))

        # Loop through the following steps until there is only one binary tree left in the list. At this time, the binary tree is the Huffman tree.
        while len(node_list) > 1:
            # Take out the smallest two-class binary tree in the weight sequence (i.e., the first two elements in the list) to build a new binary tree
            left = node_list.pop(0)
            right = node_list.pop(0)
            #The weight of the root node of the new binary tree = the sum of the weights of the two nodes
            parent = EleNode(left.value + right.value)
            # Let the left and right nodes of the new binary tree point to the two smallest nodes respectively.
            parent.left = left
            parent.right = right
            #Insert the new binary tree into the specified position in the sequence based on the size of the root node
            i = 0
            n = len(node_list)
            while i < n:
                if node_list[i].value >= parent.value:
                    node_list.insert(i, parent) # Found the location where the root node is stored
                    break
                i+=1
            else:
                # The end of the loop indicates that the new binary tree has the largest weight, and there is no one larger than it in the node_list list.
                # So add it to the end of the list
                node_list.append(parent)

        # At this time, there is only one binary tree in the list, which is the Huffman tree required.
        # Return the root node of the Huffman tree
        return node_list[0]


huffman = HuffmanTree([13, 7, 8, 3, 29, 6, 1])
node = huffman.create_huffman_tree()
node.pre_order()

Huffman coding

Basic introduction

Analysis of Huffman coding principles

Example of Huffman coding

Idea analysis

Code

""" Huffman coding """
class CodeNode:
    def __init__(self, data: int, weight: int):
        self.data = data # Store the ASCII code value corresponding to the character
        self.weight = weight # stores the number of times a character appears in the string
        self.left = None
        self.right = None

    def __str__(self):
        return f"Node[data={chr(self.data) if self.data else None}, weight={self.weight}]"

    def pre_order(self):
        """Preorder traversal of a binary tree"""
        print(self)
        if self.left:
            self.left.pre_order()
        if self.right:
            self.right.pre_order()


classHuffmanCode:
    # Huffman coding table
    # Store the encoding corresponding to each character
    # key is the corresponding character, val is the encoding corresponding to the character
    huffman_code_tab = {}
    # Record the length of the last segment of the Huffman binary encoding string
    # That is to say, the Huffman binary string is divided into eight bits. When divided into the last one, the length does not need to be 8.
    #So use a variable to store the length of the last binary string, which will be used during decoding.
    last_char_len = 0

    def create_huffman_tree(self, s: str) -> CodeNode:
        """
        Construct Huffman coded binary tree
        :param s: string to encode
        :return:
        """
        # Traverse the string, count the number of occurrences of each character, and put the results into the dictionary
        kw = {}
        for ch in s:
            ascii_code = ord(ch)
            if kw.get(ascii_code): # If the character has appeared before, directly add 1 to its number
                kw[ascii_code] + = 1
            else: # If it has not appeared before, the number of occurrences is 1
                kw[ascii_code] = 1

        # Sort the dictionary according to the number of times characters appear
        kw = sorted(kw.items(), key=lambda kv: (kv[1], kv[0]))
        # Traverse the dictionary and construct each element into a Node node
        # Add Node nodes to a new list
        node_list = []
        for k, v in kw:
            # print(chr(k),'=', v, end=', ')
            node_list.append(CodeNode(k, v))

        # Loop through the following steps until there is only one binary tree left in the list. At this time, the binary tree is the Huffman tree.
        while len(node_list) > 1:
            # Take out the smallest two-class binary tree in the weight sequence (i.e., the first two elements in the list) to build a new binary tree
            left = node_list.pop(0)
            right = node_list.pop(0)
            #The weight of the root node of the new binary tree = the sum of the weights of the two nodes
            parent = CodeNode(None, left.weight + right.weight)
            # Let the left and right nodes of the new binary tree point to the two smallest nodes respectively.
            parent.left = left
            parent.right = right
            #Insert the new binary tree into the specified position in the sequence based on the size of the root node
            n = len(node_list)
            i = 0
            while i < n:
                if node_list[i].weight >= parent.weight:
                    node_list.insert(i, parent) # Found the location where the root node is stored
                    break
                i + = 1
            else:
                # The end of the loop indicates that the new binary tree has the largest weight, and there is no one larger than it in the node_list list.
                # So add it to the end of the list
                node_list.append(parent)

        # At this time, there is only one binary tree in the list, which is the Huffman tree required.
        # Return the root node of the Huffman tree
        return node_list[0]

    def get_huffman_code_tab(self, ele_node: CodeNode, code: str, code_str: str):
        """
        Traverse the created Huffman tree and obtain the codes of all leaf nodes (leaf nodes are the characters to be obtained)
        Here it is specified that the left node is 0 and the right node is 1
        :param ele_node: The root node of the tree to be traversed, initially the root node
        :param code: indicates whether the selected path is a left node or a right node
        :param code_str: the code corresponding to each character
        :return:
        """
        code_str + = code # splicing coding
        if ele_node.data is None:
            # represents a non-leaf node, because the data of the leaf node is set to be empty when creating the Huffman tree.
            # code_str + = code
            ifele_node.left:
                self.get_huffman_code_tab(ele_node.left, '0', code_str)
            ifele_node.right:
                self.get_huffman_code_tab(ele_node.right, '1', code_str)
        else: # is a leaf node
            self.huffman_code_tab[chr(ele_node.data)] = code_str

    def huffman_zip(self, s: str) -> list:
        """
        Use the Huffman encoding table to convert each character in the string into the corresponding encoding
        That is, compress a string according to Huffman coding to obtain a compressed result.
        :param s: string to convert
        :return: Returns the encoded list
        """
        res = ''
        # Traverse the string, convert each character into the corresponding encoding, and concatenate all the encodings
        # "i like like like java do you like a java" => The following form
        # 1100111111101110001011111110111000101111111011100010100001011000101011
        # 001100001011001000011001110111111101110001011010100001011000101
        for i in s:
            res + = self.huffman_code_tab[i]
        # Split the resulting encoded string into eight bits, convert each eight bits into an int, and store the int in a list
        code_list = []
        i = 0
        n = len(res)
        while i < n:
            num = int(res[i:i + 8], 2) # Convert binary string to integer
            code_list.append(num)
            i + = 8
            if i < n <= i + 8: # It has been divided into the last part, record the length of this part
                self.last_char_len = n - i

        return code_list

    def huffman_decode(self, code_list) -> str:
        """
        Decompress Huffman encoding and obtain a readable string
        :param code_list: Huffman code list to be decompressed
        :return: decoded string
        """
        # Convert the Huffman encoding list into the corresponding binary string
        # [415, 476, 95, 476, 95, 476, 80, 177, 345, 389, 400, 206, 254, 226, 212, 44, 5] =>
        # 1100111111101110001011111110111000101111111011100010100001011000101011
        # 001100001011001000011001110111111101110001011010100001011000101
        code_str = '' # Store the corresponding binary string
        count = 0
        n = len(code_list)
        for i in code_list:
            t = "{:08b}".format(i) # Convert integer to binary string
            code_str + = t
            count + = 1
            if count == n - 1:
                break
        code_str + = "{:0{k}b}".format(code_list[count], k=self.last_char_len)

        # Swap the key values of the Huffman coding table
        # For example, the original is 'a': '001' => becomes '001': 'a'
        code_tab = {}
        for k, v in self.huffman_code_tab.items():
            code_tab[v] = k

        # Traverse binary string
        j = 0
        i=1
        n = len(code_str)
        res_code = '' #Decoded string
        while i <= n:
            t = code_str[j:i]
            ch = code_tab.get(t)
            if ch:
                res_code + = ch
                j=i
            i+=1
        return res_code


s = "i like like like java do you like a java"
# s = "I love python hahha nihao"
huffman = HuffmanCode()
root_node = huffman.create_huffman_tree(s)
huffman.get_huffman_code_tab(root_node, '', '')
huffman_code_list = huffman.huffman_zip(s)
decode_str = huffman.huffman_decode(huffman_code_list)
print(decode_str)

Notes on using Huffman coding to compress files (code is better than omitted)