A CRC file check verifies file integrity by calculating a file's CRC value and comparing it with a pre-calculated CRC check value. This technique can be used to verify whether the files in a specific directory have changed and, if changes have occurred, to print out exactly what changed.
First, implement the file and directory traversal function, which recursively outputs files or directories. There are two common implementations in Python: one built on the built-in os.walk function, the other on os.listdir. Here I encapsulate both in turn: the function ordinary_all_file uses the first method, and the function recursion_all_file uses the second. Both return a _file list, which readers can use to receive the resulting data set.
import os, hashlib, time, datetime
from zlib import crc32
import argparse

# Recursive version: traverse all files and directories
def recursion_all_file(rootdir):
    _file = []
    root = os.listdir(rootdir)
    for item in range(0, len(root)):
        path = os.path.join(rootdir, root[item])
        if os.path.isdir(path):
            _file.extend(recursion_all_file(path))
        if os.path.isfile(path):
            _file.append(path)
    for item in range(0, len(_file)):
        _file[item] = _file[item].replace("\\", "/")
    return _file

# Directory traversal implemented through os.walk in the built-in os library
def ordinary_all_file(rootdir):
    _file = []
    for root, dirs, files in os.walk(rootdir, topdown=False):
        for name in files:
            _file.append(os.path.join(root, name))
        for name in dirs:
            _file.append(os.path.join(root, name))
    for item in range(0, len(_file)):
        _file[item] = _file[item].replace("\\", "/")
    return _file
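The equivalence of the two traversal styles can be sketched as follows. This is a minimal standalone demo (not the author's exact code): `walk_files` and `listdir_files` are simplified stand-ins for `ordinary_all_file` and `recursion_all_file` that collect files only, and the test directory is a throwaway created with `tempfile`.

```python
import os
import tempfile

def walk_files(rootdir):
    # os.walk-based traversal: collects every file path under rootdir
    out = []
    for root, dirs, files in os.walk(rootdir):
        for name in files:
            out.append(os.path.join(root, name).replace("\\", "/"))
    return out

def listdir_files(rootdir):
    # os.listdir-based recursion: descend into subdirectories manually
    out = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isdir(path):
            out.extend(listdir_files(path))
        elif os.path.isfile(path):
            out.append(path.replace("\\", "/"))
    return out

# Build a small throwaway tree and compare the two results
with tempfile.TemporaryDirectory() as tmp:
    os.makedirs(os.path.join(tmp, "sub"))
    for rel in ("a.txt", os.path.join("sub", "b.txt")):
        with open(os.path.join(tmp, rel), "w") as fp:
            fp.write("demo")
    assert sorted(walk_files(tmp)) == sorted(listdir_files(tmp))
```

Both approaches discover the same set of files; os.walk is simply less code to maintain.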
Two checksum calculation methods are also provided. The first, Calculation_md5sum, uses the md5() method in the hashlib module to compute a file's MD5 digest; the second, Calculation_crc32, uses the crc32 method in the zlib library to compute a specific file's CRC32 value, as shown below.
# Read the file through the hashlib module and calculate the MD5 value
def Calculation_md5sum(filename):
    try:
        with open(filename, "rb") as fp:
            md5 = hashlib.md5()
            while True:
                temp = fp.read(8192)
                if not temp:
                    break
                md5.update(temp)
        return md5.hexdigest()
    except Exception:
        return 0

# Calculate the target file's CRC32 value, reading in chunks
def Calculation_crc32(filename):
    try:
        crc = 0
        with open(filename, "rb") as fp:
            while True:
                temp = fp.read(8192)
                if not temp:
                    break
                # crc32 accepts a running value, so chunks can be chained
                crc = crc32(temp, crc)
        return crc
    except Exception:
        return 0
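The chunked reading relies on the fact that zlib.crc32 accepts a running value as its second argument. A minimal sketch (with a made-up in-memory payload rather than a file) showing that chunked and one-shot CRC32 agree:

```python
import zlib

# Sketch: zlib.crc32 keeps a running value, so hashing data in
# 8 KB chunks gives the same result as hashing it in one call.
def crc32_of_bytes(data, chunk=8192):
    crc = 0
    for i in range(0, len(data), chunk):
        crc = zlib.crc32(data[i:i + chunk], crc)
    return crc

payload = b"hello world" * 5000          # ~55 KB of sample data
assert crc32_of_bytes(payload) == zlib.crc32(payload)
```

This is why large files can be checksummed without loading them into memory all at once.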
In the main function, arguments are parsed with the argparse library, and three features are implemented. The dump mode saves the checksums of the files in a specific directory to the dump.json file. The check mode verifies, based on the contents of dump.json, whether any file has been modified. Finally, the set mode sets the timestamps of files in batches. All three are commonly used functions.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", dest="mode", help="Specify the method to be used (set/dump/check)")
    parser.add_argument("-d", "--dir", dest="dir", help="Specify a directory (not a file) to traverse")
    parser.add_argument("-f", "--files", dest="files", help="Specify a local snapshot (dump) file name")
    parser.add_argument("-t", "--time", dest="time", help="Specify the timestamp to apply to all files")
    args = parser.parse_args()

    # Save snapshot: main.py --mode=dump -d "D:/lyshark" -f dump.json
    if args.mode == "dump" and args.dir and args.files:
        file = recursion_all_file(args.dir)
        fp = open(args.files, "w")
        for item in file:
            Single = []
            Single.append(Calculation_crc32(item))
            Single.append(item)
            fp.write(str(Single) + "\n")
            print("[+] CRC: {} ---> Path: {}".format(Single[0], Single[1]))
        fp.close()
    # Check file integrity: main.py --mode=check -d "D:/lyshark" -f dump.json
    elif args.mode == "check" and args.dir and args.files:
        fp = open(args.files, "r")
        for item in fp.readlines():
            _list = eval(item)
            # Take the recorded path out of the snapshot and recompute its CRC32
            _crc = Calculation_crc32(_list[1])
            # If the file's CRC differs from the recorded value, it has been modified
            if _list[0] != _crc and _crc != 0:
                print("[-] Exception file: {}".format(_list[1]))
            elif _crc == 0:
                print("[x] File missing: {}".format(_list[1]))
        fp.close()
    # Set file modification time: main.py --mode=set -d "D:/lyshark" -t "2019-01-01 11:22:30"
    elif args.mode == "set" and args.dir and args.time:
        _list = ordinary_all_file(args.dir)
        _time = int(time.mktime(time.strptime(args.time, "%Y-%m-%d %H:%M:%S")))
        for item in _list:
            os.utime(item, (_time, _time))
            print("[+] timestamp: {} ---> path: {}".format(str(_time), item))
    else:
        parser.print_help()
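One caveat worth noting: each snapshot line is the str() of a Python list, and parsing it back with eval() will execute any expression found in the file. A safer alternative, sketched below with a hypothetical record (the CRC value and path are made up for illustration), is ast.literal_eval, which only accepts Python literals:

```python
import ast

# Sketch: each snapshot line is str([crc, path]); ast.literal_eval
# parses it back without the code-execution risk of eval().
line = str([305419896, "D:/lyshark/demo.txt"])   # hypothetical record
record = ast.literal_eval(line)
crc_value, path = record
assert crc_value == 305419896
assert path == "D:/lyshark/demo.txt"
```

Swapping eval(item) for ast.literal_eval(item) in the check branch is a drop-in change.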
Setting mode to dump calculates the CRC feature of each file in the directory and saves the features to the dump.json file, as shown below;
Setting mode to check, together with the dump.json file produced earlier, verifies whether there are abnormal files in the current directory. If a file's characteristic value has changed, an abnormal file is reported; if the file was deleted or renamed, it is reported as missing, as shown in the figure below;
Setting mode to set modifies the time attributes of the files in a specific directory. For example, to reset all timestamps of files in the d://lyshark directory to 2019-01-01 11:22:30, execute the following command. After execution, readers can observe the file time changes themselves, as shown in the figure below;
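The timestamp conversion in the set branch can be sketched in isolation. This minimal demo (using a throwaway temp file rather than a real target directory) shows how a "YYYY-mm-dd HH:MM:SS" string becomes a Unix timestamp, interpreted in the local timezone, and is applied with os.utime:

```python
import os
import time
import tempfile

# Convert the human-readable string to a Unix timestamp (local time)
stamp = "2019-01-01 11:22:30"
ts = int(time.mktime(time.strptime(stamp, "%Y-%m-%d %H:%M:%S")))

# Stamp a throwaway file's access and modification times
with tempfile.NamedTemporaryFile(delete=False) as fp:
    path = fp.name
os.utime(path, (ts, ts))
assert int(os.path.getmtime(path)) == ts

# Formatting the timestamp back reproduces the original string
assert time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts)) == stamp
os.remove(path)
```

Note that os.utime can set access and modification times, but not the Windows "creation time".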
The file and directory traversal function can do more than scan file characteristics; combined with functions such as open, it can also scan for specific content inside files. The following implements keyword scanning over the files in a specific directory: at runtime, the reader passes in the path to scan, the keyword to search for, and the file type to match.
import os, re
import argparse

def spider(script_path, script_type):
    final_files = []
    for root, dirs, files in os.walk(script_path, topdown=False):
        for fi in files:
            dfile = os.path.join(root, fi)
            if dfile.endswith(script_type):
                final_files.append(dfile.replace("\\", "/"))
    print("[+] Found {} {} files".format(len(final_files), script_type))
    return final_files

def scanner(files_list, func):
    for item in files_list:
        fp = open(item, "r", encoding="utf-8")
        data = fp.readlines()
        for line in data:
            Code_line = data.index(line) + 1
            Now_code = line.strip("\n")
            # for unsafe in ["system", "insert", "include", "eval", "select \*"]:
            for unsafe in [func]:
                flag = re.findall(unsafe, Now_code)
                if len(flag) != 0:
                    print("Function: {} ---> Line: {} ---> Path: {}".format(flag, Code_line, item))
        fp.close()

if __name__ == "__main__":
    # Usage: main.py -p "D://lyshark" -w eval -t .php
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--path", dest="path", help="Set scan path")
    parser.add_argument("-w", "--word", dest="func", help="Set the search keyword")
    parser.add_argument("-t", "--type", dest="type", default=".php", help="Set the scan file type, default .php")
    args = parser.parse_args()
    if args.path and args.func:
        ret = spider(args.path, args.type)
        scanner(ret, args.func)
    else:
        parser.print_help()
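One subtlety in the scanner above: data.index(line) returns the index of the FIRST line with that content, so if the same line of code appears twice in a file, both hits are reported with the first line number. A small sketch (scan_lines is a simplified, hypothetical stand-in for the scanner loop) using enumerate to get the true line number:

```python
import re

# Sketch: enumerate() gives each line its real 1-based number,
# even when identical lines repeat within the file.
def scan_lines(lines, keyword):
    hits = []
    pattern = re.compile(keyword)
    for number, text in enumerate(lines, start=1):
        if pattern.search(text):
            hits.append((number, text.strip()))
    return hits

sample = ["x = 1", "eval(x)", "y = 2", "eval(x)"]   # duplicate line
assert scan_lines(sample, "eval") == [(2, "eval(x)"), (4, "eval(x)")]
```

With data.index() the second hit above would also be reported as line 2; enumerate reports line 4 correctly.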
As shown in the figure below, we pass in d://lyshark and the keyword gumbo_normalized_tagname, and set the scan suffix to *.c. When the program runs, it outputs all files in the directory that meet the conditions, along with the line where the function appears, which helps us quickly jump to and analyze the data.