Harbin Institute of Technology Information Content Security Experiment 3–String Matching

1. Experimental purpose

Be familiar with the AC or WM multi-pattern matching algorithm, implement the AC or WM string matching algorithm in a familiar high-level language, and evaluate the space and time complexity

2. Experimental requirements

PS. I only did what I had to do and chose the AC

[Must Do] 1: Implement the multi-pattern matching algorithm yourself according to the algorithm learned in class. You can choose one of AC or WM to implement.

[Required] 2: The test data set includes pattern1w, pattern2w, and pattern3w, which are pattern sets of different sizes. Each line is a pattern; the text file is a test data set, used to match the patterns in the pattern set. The program required to be implemented can load three pattern sets respectively and complete the initialization. Perform string matching on text in text files.

[Must-do] 3: Output: the number of patterns accurately matched by the three pattern sets and the content of the matched patterns.

[Required] 4: Output: the time spent in the initialization phase and the matching phase of the three pattern sets, and analyze them.

3. Design Thought

1. Logical design

(1) Tire tree construction

1) Initialize the root node.

2) Read the pattern strings and insert them into the Trie tree one by one. Each node represents a character and the path represents

Pattern string.

(2) Construct failure function

1) Use the BFS algorithm to construct the failure function of each node. These pointers point to the node with the longest valid suffix in the Trie tree.

(3) Pattern string search

1) Traverse the text and use Trie tree and invalidation function to match the pattern string.

2) Output all found pattern strings and their positions.

(4) Performance timing

1) Record the time of preprocessing pattern string and invalidation function.

2) Record the time of searching text.

(5) Clean up resources

1) Release the memory resources occupied by the Trie tree.

2) Release the memory resources occupied by text data.

(6) Main function

1) Create the root node of the Trie tree.

2) Based on user input, select the pattern file and read the pattern string from it.

3) Start timing, insert the pattern string into the Trie tree, and build the failure function.

4) End the timing and output the time taken to build the Trie tree and the failure function.

5) Read text files and prepare text data for search.

6) Start timing and use the AC automaton to search for pattern strings in the text.

7) End the timer and output the search time and the total number of matches found.

8) Release all allocated memory resources.

9) The program ends.

4. Code implementation

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_PATTERNS 100000 // Assume that there are at most 100000 patterns in the pattern set
#define MAX_PATTERN_LENGTH 500 assumes the maximum length of each pattern is 500
#define ALPHABET_SIZE 256 //ASCII character set size


//define the structure of the node
typedef struct Node {
    struct Node* children[ALPHABET_SIZE];
    struct Node* fail;
    int is_end_of_word;
    char* pattern; //Used to store the complete pattern string matched to reach this state
} Node;



//Create and initialize nodes
Node* create_node() {
    Node* new_node = (Node*)malloc(sizeof(Node));
    int i;
    for (i = 0; i < ALPHABET_SIZE; i + + ) {
        new_node->children[i] = NULL;
    }
    new_node->fail = NULL;
    new_node->is_end_of_word = 0;
    new_node->pattern = NULL;
    return new_node;
}



//Insert a pattern string into the trie tree
void insert_pattern(Node* root, const char* pattern) {
    Node* current = root; //current points to the current node
    const char* original_pattern = pattern; //Save the starting address of pattern
    while (*pattern) {
        if (current->children[*pattern] == NULL) {
            current->children[*pattern] = create_node();
        }
        current = current->children[*pattern];
        pattern + + ;
    }
    current->is_end_of_word = 1;
    current->pattern = _strdup(original_pattern);//Store the pattern string in the node
}



//Build an invalidation function (implemented using linked lists and queues)
//Define linked list nodes and queue structures
typedef struct QueueNode {
    Node* trie_node;
    struct QueueNode* next;
} QueueNode;

typedef struct {
    QueueNode* front;
    QueueNode* rear;
} Queue;

//Create queue
Queue* createQueue() {
    Queue* q = (Queue*)malloc(sizeof(Queue));
    q->front = q->rear = NULL;
    return q;
}

//Enqueue operation
void enqueue(Queue* q, Node* node) {
    QueueNode* temp = (QueueNode*)malloc(sizeof(QueueNode));
    temp->trie_node = node;
    temp->next = NULL;

    if (q->rear == NULL) {
        q->front = q->rear = temp;
        return;
    }

    q->rear->next = temp;
    q->rear = temp;
}

// Dequeue operation
Node* dequeue(Queue* q) {
    if (q->front == NULL) {
        return NULL;
    }

    QueueNode* temp = q->front;
    Node* node = temp->trie_node;
    q->front = q->front->next;

    if (q->front == NULL) {
        q->rear = NULL;
    }

    free(temp);
    return node;
}

// Check if the queue is empty
int isEmpty(Queue* q) {
    return q->front == NULL;
}

// Release queue memory
void freeQueue(Queue* q) {
    while (!isEmpty(q)) {
        dequeue(q);
    }
    free(q);
}

//Construct failure function
void build_fail_pointers(Node* root) {
    // Use BFS algorithm to construct fail pointer
    Node*temp;
    Node* fail_node;
    Queue* queue = createQueue(); // Queue implemented using linked list

    //The failure functions of the nodes in the first layer all point to the root node
    int i;
    for (i = 0; i < ALPHABET_SIZE; i + + ) {
        if (root->children[i] != NULL) {
            root->children[i]->fail = root;
            enqueue(queue, root->children[i]);//BFS adds all child nodes of the root node to the queue
        }
    }

    while (!isEmpty(queue)) {
        temp = dequeue(queue); // The invalidation function of temp is known, that is, it is not empty
        for (i = 0; i < ALPHABET_SIZE; i + + ) {
            if (temp->children[i] != NULL) {
                fail_node = temp->fail;
                //If the corresponding child node of the node pointed to by the current node's failure function is empty, trace the failure function pointed to by the failure function
                while (fail_node != NULL & amp; & amp; fail_node->children[i] == NULL) {
                    fail_node = fail_node->fail;
                }
                //Trace all the way back to the failure function pointing to null, that is, the failure function of the root node
                if (fail_node == NULL) {
                    temp->children[i]->fail = root;
                }
                //Or find a suitable non-empty child node and assign this child node to the invalidation function of the current node's child node
                else {
                    temp->children[i]->fail = fail_node->children[i];
                }
                //Enqueue the child node
                enqueue(queue, temp->children[i]);
            }
        }
    }

    freeQueue(queue); // Release the memory of the queue
}



//Search for patterns in the trie tree in the given text
int search_text(Node* root, const char* text) {
    Node* current = root;
    int match_count = 0;
    while (*text) {
        //If the current node is not empty and there is no branch corresponding to the current character in the child nodes of the current node, enter the loop
        while (current != NULL & amp; & amp; current->children[*text] == NULL) {
            //Backtrack upward in the Trie tree and move to other branches through the fail pointer
            current = current->fail;
        }
        // If the current node is empty, it has been traced back to the root node and the corresponding branch has not been found.
        if (current == NULL) {
            current = root;//Start a new round of matching
        }
        else {
            current = current->children[*text];
        }
        Node* temp = current;
        while (temp != root & amp; & amp; temp->is_end_of_word) {
            printf("Pattern string found: %s\
", temp->pattern);
            match_count + + ;
            // Move the temporary node to the node pointed to by its invalidation function to check for other possible matches
            temp = temp->fail;
        }
        text + + ;
    }
    return match_count;
}



//Release all memory allocated in the trie tree
void free_memory(Node* root) {
    if (root == NULL) {
        return;
    }
    int i;
    for (i = 0; i < ALPHABET_SIZE; i + + ) {
        free_memory(root->children[i]);
    }
    if (root->pattern != NULL) {
        free(root->pattern);
    }
    free(root);
}



int main() {
    Node* root = create_node();
    char pattern[MAX_PATTERN_LENGTH];
    FILE* pattern_file=NULL;
    FILE* text_file;
    char* text_data;
    long length;
    int match_count;
    clock_t start_time_p, end_time_p, start_time_t, end_time_t;
    errno_t err;
    int p=0;

    while (p != 1 & amp; & amp; p != 2 & amp; & amp; p != 3)
    {
        printf("Please enter the pattern file number (1~3):\
");
        //scanf("%d", & amp;p);
        scanf_s("%d", & amp;p, sizeof(p));
        //printf("Please enter the serial number of the file to be tested:\
");
        //scanf("%d", & amp;t);
        //scanf_s("%s", & amp;t, sizeof(t));
        if (p == 1) {
            //Load the pattern set and build the AC automaton
            err = fopen_s( & amp;pattern_file, "pattern1w.txt", "r");
            if (err != 0) {
                fprintf(stderr, "Cannot open 'pattern1w.txt'\
");
                return 1;
            }
        }
        if (p == 2) {
            //Load the pattern set and build the AC automaton
            err = fopen_s( & amp;pattern_file, "pattern2w.txt", "r");
            if (err != 0) {
                fprintf(stderr, "Cannot open 'pattern2w.txt'\
");
                return 1;
            }
        }
        if (p == 3) {
            //Load the pattern set and build the AC automaton
            err = fopen_s( & amp;pattern_file, "pattern3w.txt", "r");
            if (err != 0) {
                fprintf(stderr, "Cannot open 'pattern3w.txt'\
");
                return 1;
            }
        }
    }
    start_time_p = clock();
    while (fgets(pattern, MAX_PATTERN_LENGTH, pattern_file)) {
        // Remove possible newlines
        pattern[strcspn(pattern, "\r\
")] = 0;
        insert_pattern(root, pattern);
    }
    build_fail_pointers(root);
    end_time_p = clock();
    //printf("Initializing pattern1w takes: %f seconds\
", (double)(end_time_p - start_time_p) / CLOCKS_PER_SEC);
    fclose(pattern_file);

    //Read text data
    //text_file = fopen("text.txt", "r");
    err = fopen_s( & amp;text_file, "text.txt", "r");
    if (err != 0) {
        fprintf(stderr, "Cannot open 'text.txt'\
");
        return 1;
    }
    //if (!text_file) {
     // fprintf(stderr, "Error opening text file.\
");
      // fclose(pattern_file);
        //free_memory(root);
        //return 1; //Exit
        //program
    //}
    fseek(text_file, 0, SEEK_END);
    length = ftell(text_file);
    fseek(text_file, 0, SEEK_SET);
    text_data = (char*)malloc(length + 1); // Explicit conversion to char*
    
    if (text_data) {
        //fread(text_data, 1, length, text_file);
        size_t bytes_read = fread(text_data, 1, length, text_file);
        text_data[bytes_read] = '\0'; // Set the string terminator
    }
    else {
        fprintf(stderr, "Failed to allocate memory for text data.\
");
        fclose(text_file);
        fclose(pattern_file);
        free_memory(root);
        return 1; //Exit the program
    }
    fclose(text_file);

    //Search for matches
    start_time_t = clock();
    match_count = search_text(root, text_data);
    end_time_t = clock();
    if (p == 1) {
        printf("Initializing pattern1w takes: %f seconds\
", (double)(end_time_p - start_time_p) / CLOCKS_PER_SEC);
        printf("Searching time for pattern1w: %f seconds\
", (double)(end_time_t - start_time_t) / CLOCKS_PER_SEC);
    }
    if (p == 2) {
        printf("Initializing pattern2w takes: %f seconds\
", (double)(end_time_p - start_time_p) / CLOCKS_PER_SEC);
        printf("Searching time for pattern2w: %f seconds\
", (double)(end_time_t - start_time_t) / CLOCKS_PER_SEC);
    }
    if (p == 3) {
        printf("Initializing pattern3w takes: %f seconds\
", (double)(end_time_p - start_time_p) / CLOCKS_PER_SEC);
        printf("Searching time for pattern3w: %f seconds\
", (double)(end_time_t - start_time_t) / CLOCKS_PER_SEC);
    }
    //printf("Initializing pattern1w takes: %f seconds\
", (double)(end_time_p - start_time_p) / CLOCKS_PER_SEC);
    //printf("Searching time for pattern1w: %f seconds\
", (double)(end_time_t - start_time_t) / CLOCKS_PER_SEC);
    printf("Total number of matches found: %d\
", match_count);

    free(text_data);
    free_memory(root);

    return 0;
}