Scanning scheme for default file system types

1. Introduction

Normally, when we need to traverse the entire disk, we will choose to use the interface provided by the system, or call the filesystem module in the standard library or boost library.

The traversal method of the filesystem library is relatively simple, so this article only lists the usage of the system traversal interface.

2. Windows

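Traverse directories iteratively using the FindFirstFile/FindNextFile/FindClose functions. Pending directories are kept on a stack (the most recently discovered directory is scanned next), so the traversal order is depth-first rather than breadth-first.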

HANDLE hFind;
WIN32_FIND_DATA find_data;
vec_dir.emplace_back(path);

os_string directory;
while (!vec_dir.empty()) {<!-- -->
  directory = vec_dir.back() + L"\*";
  vec_dir.pop_back();

  hFind = FindFirstFile(directory.c_str(), & amp;find_data);
  if (hFind != INVALID_HANDLE_VALUE) {<!-- -->
    do {<!-- -->
      // Skip system directories
      if (find_data.cFileName[0] == L'$' ||
          wcscmp(find_data.cFileName, L".") == 0 ||
          wcscmp(find_data.cFileName, L"..") == 0)
        continue;

      // Ignore compressed files
      if (find_data.dwFileAttributes & FILE_ATTRIBUTE_COMPRESSED ||
          find_data.dwFileAttributes & FILE_ATTRIBUTE_ENCRYPTED)
        continue;

      os_string filepath =
          directory.substr(0, directory.length() - 1) + find_data.cFileName;
      if (find_data.dwFileAttributes & amp; FILE_ATTRIBUTE_DIRECTORY) {<!-- -->
        if (find_data.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) {<!-- -->
          continue;
        } else {<!-- -->
          vec_dir.push_back(filepath);
        }
      } else {<!-- -->
        b64 filesize = find_data.nFileSizeLow | (b64)find_data.nFileSizeHigh
                                                    << 32;

        // deal with file content
      }
    } while (FindNextFile(hFind, & amp;find_data));
  }
  FindClose(hFind);
}

If you are familiar with the lower-level interfaces of the Windows operating system, you will know that the functions above are themselves implemented on top of the native NtQueryDirectoryFile system call exported by ntdll.dll.

Therefore, you can choose to call the NtQueryDirectoryFile function directly to improve efficiency.

// NTSTATUS codes NtQueryDirectoryFile returns once a directory has been fully
// enumerated. Guarded so they do not clash with an included ntstatus.h.
#ifndef STATUS_NO_SUCH_FILE
#define STATUS_NO_SUCH_FILE 0xC000000FL
#endif
#ifndef STATUS_NO_MORE_FILES
#define STATUS_NO_MORE_FILES 0x80000006L
#endif

// Resolve NtQueryDirectoryFile dynamically from ntdll.dll.
scope::ScopedModule ntdll(LoadLibrary(L"ntdll.dll"));
pNtQueryDirectoryFile_ = reinterpret_cast<PNTQUERYDIRECTORYFILE>(
    GetProcAddress(ntdll, "NtQueryDirectoryFile"));
if (!pNtQueryDirectoryFile_) {
  // Without the entry point the traversal below cannot run.
  return false;
}

// Stack of directories that still have to be scanned.
vector<os_string> vec_dir;
vec_dir.emplace_back(path);
os_string directory;

// One query fills the buffer with several FILE_DIRECTORY_INFORMATION records;
// it is sized for 16 records with MAX_PATH-long names.
scope::ScopedPtr<b8> query_buffer;
ulong query_buffer_size;
query_buffer_size =
    sizeof(FILE_DIRECTORY_INFORMATION) + MAX_PATH * sizeof(WCHAR);
query_buffer_size *= 16;

query_buffer = new b8[query_buffer_size];
if (!query_buffer) {
  return false;
}

while (!vec_dir.empty()) {
  // Keep the trailing backslash so child paths can be formed by appending
  // the entry name directly.
  directory = vec_dir.back() + L"\\";
  vec_dir.pop_back();

  // FILE_FLAG_BACKUP_SEMANTICS is required to open a directory handle.
  scope::ScopedHandle hFind(
      CreateFile(directory.c_str(), SYNCHRONIZE | FILE_LIST_DIRECTORY,
                 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
                 OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL));
  if (hFind != INVALID_HANDLE_VALUE) {
    IO_STATUS_BLOCK IoStatusBlock;
    for (;;) {
      const NTSTATUS nt_status = pNtQueryDirectoryFile_(
          hFind, NULL, NULL, NULL, &IoStatusBlock, query_buffer.get(),
          query_buffer_size, FileDirectoryInformation, FALSE, NULL, FALSE);
      if (nt_status != 0) {
        // STATUS_NO_MORE_FILES / STATUS_NO_SUCH_FILE mean the directory is
        // exhausted. Any other failure also ends this directory: repeating
        // the same failing query would never terminate.
        break;
      }

      // Walk the chain of records; NextEntryOffset == 0 marks the last one.
      b8* const buffer_end = query_buffer.get() + query_buffer_size;
      PFILE_DIRECTORY_INFORMATION file_info =
          reinterpret_cast<PFILE_DIRECTORY_INFORMATION>(query_buffer.get());
      while (reinterpret_cast<b8*>(file_info) < buffer_end) {
        // FileName is NOT null-terminated; FileNameLength is in bytes.
        const WCHAR* const name = file_info->FileName;
        const ULONG name_length = file_info->FileNameLength / sizeof(WCHAR);
        const ULONG attributes = file_info->FileAttributes;

        const bool is_dot = name_length == 1 && name[0] == L'.';
        const bool is_dot_dot =
            name_length == 2 && name[0] == L'.' && name[1] == L'.';
        // Names starting with '$' are treated as system entries.
        const bool is_system = name_length > 0 && name[0] == L'$';
        const bool is_compressed_or_encrypted =
            (attributes &
             (FILE_ATTRIBUTE_COMPRESSED | FILE_ATTRIBUTE_ENCRYPTED)) != 0;

        if (!is_dot && !is_dot_dot && !is_system &&
            !is_compressed_or_encrypted) {
          // Build the path from the directory being scanned, not from the
          // traversal root, so nested entries get their full path.
          os_string filepath(directory);
          filepath.append(name, name_length);
          if (attributes & FILE_ATTRIBUTE_DIRECTORY) {
            // Reparse points (junctions, symbolic links) are not followed.
            if (!(attributes & FILE_ATTRIBUTE_REPARSE_POINT)) {
              vec_dir.push_back(filepath);
            }
          } else {
            b64 file_size = file_info->EndOfFile.QuadPart;
            // deal with file content
          }
        }

        if (!file_info->NextEntryOffset) {
          break;
        }
        file_info = reinterpret_cast<PFILE_DIRECTORY_INFORMATION>(
            reinterpret_cast<b8*>(file_info) + file_info->NextEntryOffset);
      }
    }
  }
}

3. Linux

On the Linux platform, the first thing that comes to mind for directory traversal is the opendir/readdir/closedir family of functions.

// Iterative directory traversal built on opendir/readdir.
// Directories still to be scanned are kept on a stack (vec_dir).
vector<os_string> vec_dir;
vec_dir.push_back(path);

struct dirent* p_file_info = NULL;
scope::ScopedDir p_dir;

os_string directory;
while (!vec_dir.empty()) {
  directory = vec_dir.back();
  vec_dir.pop_back();

  // Child paths are formed by appending the entry name, so the directory
  // must end with a separator ("/home" + "user" would give "/homeuser").
  if (!directory.empty() && directory[directory.length() - 1] != '/') {
    directory += '/';
  }

  p_dir = opendir(directory.c_str());
  if (p_dir != NULL) {
    while (NULL != (p_file_info = readdir(p_dir))) {
      const char* const name = p_file_info->d_name;
      const unsigned char type = p_file_info->d_type;

      const bool is_self_or_parent =
          strcmp(name, ".") == 0 || strcmp(name, "..") == 0;
      // FIFOs, sockets, symbolic links and device nodes are not scanned.
      // NOTE: some file systems report DT_UNKNOWN in d_type; such entries
      // end up in the regular-file branch below.
      const bool is_special = type == DT_FIFO || type == DT_SOCK ||
                              type == DT_LNK || type == DT_CHR ||
                              type == DT_BLK;

      if (!is_self_or_parent && !is_special) {
        const os_string filepath = directory + name;
        if (type == DT_DIR) {
          // Do not descend into directories that are mount points.
          if (!p_disk_scanner_->CheckFileSystemMounted(filepath)) {
            vec_dir.push_back(filepath);
          }
        } else {
          // deal with file content
        }
      }

      // Progress accounting runs for every entry, skipped ones included.
      p_disk_scanner_->UpdateScannedSpace(1);
      p_disk_scanner_->UpdateScannedProcess(1);
    }
  }
}