+1 (315) 557-6473 

How to write an assignment on Huffman coding in C : Tips with example

A complete guide for students working on assignment on lossless data compression algorithm. Best coding practices have been followed so that students can get maximum grades.

Assignment Description

A lossless data compression algorithm is huffman coding. For each character entered into this algorithm, a variable-length code is assigned. The frequency of character use affects the code length. The codes for the most common characters are the shortest, while the codes for the least common characters are longer.

 Understanding the given code with a simple example:

Let's say we want to save a data file with 100,000 characters. There are just 6 characters in the file, and they appear at the following frequencies:

a=45

b=13

c=12

d=16

e=9

f=5

#include

#include

#include

typedef struct node_t {

    struct node_t *left;

    struct node_t *right;

    char c;

    char code[10];

    int freq;

} NODE;

NODE *pqueue[256], **pq = pqueue-1;

NODE *savequeue[256];

NODE pool[256];

int farr[256] = {0};

char code_arr[256][10] = {{0}};

int size = 0, nsize = 0, qsize = 1;

int savesize;

char code[10] = {0};

NODE* new_node(int freq, char c, NODE *a, NODE *b);

void pq_insert(NODE *node);

NODE *pq_remove();

int print_tree(NODE *node, char *str, int len);

int main(int argc, char **argv)

{

    int uncompressed = 0;

    int compressed_size = 0;

    char characters[256];

    char characterCodes[256][10];

    int characterFrequencies[256];

    int numOfCharacters;

    char inputTextFilePath[100];

    char codeTableFilePath[100];

    char outputTextFilePath[100];

    //int buffer_size;

    char buffer[1000];

    if (argc != 5) {

        printf("Usage: %s encode|decode \n", argv[0]);

        exit(0);

Each character is encoded as a binary string or codeword using a binary code. We are looking for a binary code that compresses the file as much as feasible by encoding it with the fewest number of bits.

Each codeword in a fixed-length code is the same length. Codeword length can vary in a variable-length code

    }

    strcpy(inputTextFilePath, argv[2]);

    strcpy(codeTableFilePath, argv[3]);

    strcpy(outputTextFilePath, argv[4]);

    if (!strcmp(argv[1], "encode")) {

        FILE *inputFile = fopen(inputTextFilePath, "r");

        if (inputFile == NULL)

        {

           printf("Could not open file to read: %s\n",inputTextFilePath);

         return 1;

        }

        char c;

        while ((c = fgetc(inputFile)) != EOF && c!='\n')

        {

            farr[c]++;

            buffer[uncompressed++] = c;

      //do whatever you would like to do with this character now!!

        }

        fclose(inputFile);

        for (int i =0; i < 256; i++)

        {

            if (farr[i] != 0) {

                //printf("char = %c, freq = %d\n", i, farr[i]);

                NODE *n = new_node(farr[i], i, 0, 0);

                pq_insert(n);

            }

        }

        memcpy(savequeue, pqueue, sizeof(pqueue));

        savesize = qsize;

        while (qsize > 2) {

            NODE *a = pq_remove();

            NODE *b = pq_remove();

            pq_insert(new_node(0, 0, a, b));

        }

Now to understand the encoding part of huffman coding.

It is simple to encode the message when given a code (matching to some alphabet) and a message. Simply substitute the codewords for the characters.

For instance: = a, b, c, d

if the code is

C1{A = 00, B = 01, C = 10, D = 11}

If the code is

C2 = {a = 0, b = 110, c = 10, d = 111}

then bad is encoded like 1101111

        // Traverse the tree and calculate the compressed_size

        char code[10] = {0};

        compressed_size = print_tree(&pool[nsize-1], code, 0);

        pq = savequeue - 1;

        numOfCharacters = savesize - 1;

        for (int i = savesize-1, j = 0; i > 0; i--, j++) {

            if (pq[i]->freq == 0) continue;

            strcpy(code_arr[pq[i]->c], pq[i]->code);

            characters[i-1] = pq[i]->c;

            strcpy(characterCodes[i-1], pq[i]->code);

            characterFrequencies[i-1] = pq[i]->freq;

        }

        FILE *codeTableFile = fopen(codeTableFilePath, "w");

        if (codeTableFile == NULL)

        {

            printf("Could not open file to write: %s\n",codeTableFilePath);

            return 1;

        }

        for(int i = numOfCharacters - 1; i >= 0; i--)

        {

            fprintf(codeTableFile, "%c,%s,%d\n", characters[i], characterCodes[i], characterFrequencies[i]);

        }

        fclose(codeTableFile);

        printf("Original: %d bits\n", uncompressed*8);

        printf("Compressed: %d bits\n", compressed_size);

        printf("Compression Ratio: %.2f%%\n", (float)compressed_size/((float)uncompressed*8)*100);

        FILE *outputTextFile = fopen(outputTextFilePath, "w");

        for (int i = 0; i < uncompressed; i++) {

            fprintf(outputTextFile, "%s", code_arr[buffer[i]]);

        }

        fclose(outputTextFile);

    }

    else if (!strcmp(argv[1], "decode")) {

        nsize = 0;

        NODE *root = &pool[nsize++];

        NODE *currptr = root;

        strcpy(codeTableFilePath, argv[2]);

        strcpy(inputTextFilePath, argv[3]);

        strcpy(outputTextFilePath, argv[4]);

        FILE *codeTableFile = fopen(codeTableFilePath, "r");

        if (codeTableFile == NULL)

        {

            printf("Could not open file to read: %s\n",codeTableFilePath);

            return 1;

        }

        char c;

        char cod[300], line[200];

        int frq, ret;

        while (fgets(line, sizeof(line), codeTableFile) != NULL) {

            char *ptr = line;

            while (ptr) {

                if (*ptr == '\n') break;

                if (*ptr == ',') *ptr = ' ';

                ptr++;

            }

For the decoding part of this procedure.

C1 = {a = 00, b = 01, c = 10, d = 11}.

C2 = {a = 0, b = 110, c = 10, d = 111}.

C3 = {a = 1, b = 110, c = 10, d = 111}

Decoding a message that has been encoded is the procedure.

of transforming it into the initial message.

If there is just one way to decode a message, it is uniquely decodable (in relation to a specific code).

For instance, 010011 is only decodable to bad with respect to C1.

1101111 is uniquely decodable to bad with respect to C2.

Regarding C3, however, 1101111 cannot be uniquely decoded because it might have encoded either bad or acad.

It is possible to demonstrate that each message is encoded.

C1 and C2 can only be decoded in this way. the distinct

To be decipherable, a text must possess this feature.

            sscanf(line, "%c %s %d", &c, code, &frq);

            currptr = root;

            for (int i = 0; i < strlen(code); i++) {

                NODE *newnode = &pool[nsize++];

                newnode->left = NULL;

                newnode->right = NULL;

                newnode->c = 0;

                if (code[i] == '0') {

                    if (!currptr->left) currptr->left = newnode;

                    else nsize--;

                    currptr = currptr->left;

                } else if (code[i] == '1') {

                    if (!currptr->right) currptr->right = newnode;

                    else nsize--;

                    currptr = currptr->right;

                }

            }

            currptr->c = c;

            currptr->freq = frq;

        }

        fclose(codeTableFile);

        FILE *inputTextFile = fopen(inputTextFilePath, "r");

        NODE *ptr = root;

        int idx = 0;

        char outbuf[1000];

        while ((c = fgetc(inputTextFile)) != EOF) {

            if (c == '0') ptr = ptr->left;

            else if (c == '1') ptr = ptr->right;

            if (!ptr->left && !ptr->right) {

               outbuf[idx++] = ptr->c;

               ptr = root;

            }

        }

        outbuf[idx] = 0;

        printf("decoded = %s\n", outbuf);

        fclose(inputTextFile);

        FILE *outputTextFile = fopen(outputTextFilePath, "w");

        fprintf(outputTextFile, "%s", outbuf);

        fclose(outputTextFile);

    }

    return 0;

}

NODE* new_node(int freq, char c, NODE *a, NODE *b)

{

    NODE *node = &pool[nsize++];

    if (freq != 0) {

        node->c = c;

        node->freq = freq;

    } else {

        node->c = c;

        node->left = a;

        node->right = b;

        node->freq = a->freq + b->freq;

    }

    return node;

Fixed-length codes can always be cracked uniquely (why). We previously observed that these don't always provide the best. We prefer to utilise variable length codes because of compression.

Priority Code: A prefix (free) code is one that:

No prefix of another codeword exists.

As an illustration, "a = 0, b = 110, c = 10, d = 111"

a code prefix.

Fact: Each message is encoded with a prefix.

Free code can only be broken once. Since no codeword is a prefix of any other we can always find

off the initial codeword in a message, then proceed. decoding. Example:

01101100 = 01101100 = abba

Therefore, finding good (optimal compression) prefix-free codes is of interest to us.

}

void pq_insert(NODE *node)

{

    int j, i = qsize++;

    pq[i] = node;

    for (int j = 1; j < i; j++) {

        if (pq[j]->freq > pq[i]->freq ||

            (pq[j]->freq == pq[i]->freq && pq[j]->c > pq[i]->c)) {

            for (int k = i; k > j; k--) {

                pq[k] = pq[k-1];

            }

            pq[j] = node;

            break;

        }

    }

}

NODE *pq_remove()

{

    int i, j;

    NODE *node = pq[i = 1];

    if (qsize < 2) return 0;

    qsize--;

    while ((j = i + 1) < qsize) {

        pq[i] = pq[j];

        i = j;

    }

    pq[i] = pq[qsize];

    return node;

}

int print_tree(NODE *ptr, char *str, int len)

{

    static int comp_size = 0;

    if (!ptr) return comp_size;

    if (!ptr->left && !ptr->right) {

        strcpy(ptr->code, str);

        comp_size += strlen(str) * ptr->freq;

        //printf("node: %c - %s - %d\n", ptr->c, str, ptr->freq);

        len = 0;

    } else {

        str[len] = '0';

        str[len+1] = 0;

        print_tree(ptr->left, str, len+1);

        str[len] = '1';

        str[len+1] = 0;

        print_tree(ptr->right, str, len+1);

    }

    return comp_size;

}

Hence the c code for the assignment on Huffman coding is completed after performing the procedure above we followed. We will get the desired output.

Steps of Huffman Coding.

Step 1: Pick two letters x, y from alphabet A with the smallest frequencies and create a subtree that has these two characters as leaves. (greedy idea) Label the root of this subtree as z.

Step 2: Set frequency f(z) = f(x) + f(y). Remove x, y and add z creating new alphabet A0 = A ∪ {z} − {x, y}. Note that |A0 | = |A| − 1. Repeat this procedure, called merge, with new alphabet A0 until an alphabet with only one symbol is left. The resulting tree is the Huffman code.


Comments
No comments yet be the first one to post a comment!
Post a comment