| /* |
| * Utility to verify a postgres base directory. |
| * |
| * This utility scans the 'base' data directory of a postgres instance to find |
| * directories and segment files. Each segment file is then checked for any |
| * pages that may have invalid checksums. Invalid checksums are determined by |
| * comparing the stored checksum against the current checksum. |
| * |
| * In order to use this utility, it is necessary to enable checksums with |
| * initdb. The flag for initdb is either -k or --data-checksums. Without |
| * enabling checksums, it is not possible to use this utility. |
| * |
| */ |
| |
| /* |
| * TODO: |
| * - improve diagnostics and/or repair if corrupted pages found |
| * - create test that simulates/creates corruption |
| * - check directory permissions since sudo may be required |
| * - handle exceptions with setjmp or signals |
| * - aggregate and/or create a report of corrupted blocks |
| * - implement dump of corrupted pages ( hexdump? ) |
| * - run on multiple segment files simultaneously |
| * - detect if checksums enabled; regex for unset checksums? |
| * - throttling mechanism based on diskio/memory/cpu |
| * - convert this to an extension? |
| * - implement page dumping if checksum mismatch |
| */ |
| |
| /* standard header files */ |
| #include <errno.h> |
| #include <getopt.h> |
| #include <linux/limits.h> |
| #include <pthread.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| |
| /* headers for directory scanning */ |
| #include <dirent.h> |
| #include <stddef.h> |
| #include <string.h> |
| #include <sys/stat.h> |
| |
| /* postgres specific header files */ |
| #include "c.h" |
| #include "pg_config.h" |
| #include "postgres.h" |
| #include "storage/checksum_impl.h" |
| |
| #define MAX_DIR_LENGTH 256 /* size in bytes of the stack buffers used to build paths */ |
| #define MAX_THREAD_COUNT 10 /* number of worker threads started by scan_base_directory */ |
| |
| /* global flag values, set from the command line in main */ |
| int verbose = 0; /* -v: print DEBUG/ERROR detail while scanning */ |
| int dump_corrupted = 0; /* -c: set by main; not read in the code visible here */ |
| bool parallel = false; /* -p: scan data directories with worker threads */ |
| int num_directories = 0; /* incremented by count_entries; upper bound for queue_index */ |
| |
| // Variables used to control the multi-threading. |
| volatile int queue_index = 0; /* next directory index handed out by get_queue_index */ |
| pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; /* held while queue_index is read and incremented */ |
| |
/*
 * SPECKLE_POSTGRES: placeholder for builds with postgres assertions enabled.
 * The useful definition lives in cs/src/backend/utils/error/assert.c, which
 * is not currently exported outside the postgres tree, so this stand-in
 * ignores all of its arguments and simply terminates the process.
 */
void
ExceptionalCondition(const char *conditionName, const char *errorType,
                     const char *fileName, int lineNumber)
{
    /* the diagnostic details are intentionally not reported */
    (void) conditionName;
    (void) errorType;
    (void) fileName;
    (void) lineNumber;

    abort();
}
| |
/*
 * Determine segment number by segment file name. For instance, if file
 * name is /path/to/xxxx.7 procedure returns 7.
 *
 * The segment number is the run of decimal digits at the very end of the
 * name, and it only counts when that run is immediately preceded by a '.'.
 *
 * Returns 0 when fileName is NULL or empty, when there is no ".N" suffix
 * (including a name made up only of digits, or a name ending in a bare '.'),
 * or when the digits do not fit in an unsigned int.
 */
static unsigned int
get_segment_number(const char* fileName)
{
    const unsigned int maxSegnum = ~0u;
    unsigned int segnum = 0;
    size_t nameLen;
    size_t digitsStart;
    size_t i;

    if (fileName == NULL)
        return 0;

    nameLen = strlen(fileName);
    digitsStart = nameLen;

    /*
     * Walk backwards over the trailing digits. Comparing against '0'..'9'
     * directly avoids handing a possibly negative char to isdigit().
     */
    while (digitsStart > 0 &&
           fileName[digitsStart - 1] >= '0' &&
           fileName[digitsStart - 1] <= '9')
        digitsStart--;

    /* the digit run must be introduced by a '.' */
    if (digitsStart == 0 || fileName[digitsStart - 1] != '.')
        return 0;

    /* accumulate by hand so that an absurdly long suffix cannot overflow */
    for (i = digitsStart; i < nameLen; i++)
    {
        unsigned int digit = (unsigned int) (fileName[i] - '0');

        if (segnum > (maxSegnum - digit) / 10)
            return 0;
        segnum = segnum * 10 + digit;
    }

    return segnum;
}
| |
| static bool |
| is_page_corrupted(const char *page, BlockNumber blkno, const char *filename, |
| const char *dirpath) |
| { |
| /* Function checks a page header checksum value aginst the current |
| * checksum value of a page. NewPage checksums will be zero until they |
| * are set. There is a similar function PageIsVerified responsible for |
| * checking pages before they are loaded into buffer pool. |
| * see: src/backend/storage/page/bufpage.c |
| * |
| * Consider returning a negative value if page is new or checksum unset |
| * or if more detail for a page verifiction can be found. |
| */ |
| |
| PageHeader phdr = (PageHeader)page; |
| |
| /* calculating blkno needs to be aboslute so that subsequent segment files |
| * have the blkno calculated based on all segment files and not relative to |
| * the current segment file. |
| * see: https://git.postgresql.org/gitweb/?p=pg_filedump.git;a=commitdiff;h=052ed0112967dd1e9b0e2cbe54821c04475f1a3a;hp=b163cdaa53b651958cc8 |
| */ |
| |
| /* Segment size in bytes, BLCKSZ is 8192 by default, 8KB pages |
| * 1GB segment files are 131072 blocks of 8KB page size |
| * NOTE: pd_pagesize_version is BLCKSZ + version, since 8.3+, version is 4, |
| * resulting in pd_pagesize_version being 8196 when pagesize is 8KB |
| */ |
| static unsigned int segmentSize = RELSEG_SIZE * BLCKSZ; |
| |
| /* Number of current segment */ |
| static unsigned int segmentNumber = 0; |
| |
| if (parallel) |
| { |
| char full_filepath[MAX_DIR_LENGTH]; |
| sprintf(full_filepath, "%s/%s", dirpath, filename); |
| segmentNumber = get_segment_number(full_filepath); |
| } else |
| { |
| segmentNumber = get_segment_number(filename); |
| } |
| |
| /* segmentBlockOffset is the absolute blockNumber of the block when taking |
| * into account any previous segment files. |
| */ |
| uint32 segmentBlockOffset = RELSEG_SIZE * segmentNumber; |
| |
| uint16 checksum = pg_checksum_page((char *)page, segmentBlockOffset + blkno); |
| |
| bool corrupted = false; |
| |
| if (verbose) |
| { |
| printf("DEBUG: filename: %s/%s[%d]\n \ |
| \tsegmentBlockOffset: %d, maxSegmentSize: %d,\n \ |
| \tsegmentNumber: %d, relative blkno: %d, absolute blkno: %d,\n \ |
| \tchecksum: %x, phdr->pd_checksum: %x,\n \ |
| \tphdr->pd_flags: %d, phdr->pd_lower: %d, phdr->pd_upper: %d,\n \ |
| \tphdr->pd_special: %d, phdr->pd_pagesize_version: %d,\n \ |
| \tphdr->pd_prune_xid: %d\n", |
| dirpath, filename, blkno, |
| segmentBlockOffset, segmentSize, |
| segmentNumber, blkno, segmentBlockOffset + blkno, |
| checksum, phdr->pd_checksum, |
| phdr->pd_flags, phdr->pd_lower, phdr->pd_upper, |
| phdr->pd_special, phdr->pd_pagesize_version, |
| phdr->pd_prune_xid ); |
| } |
| |
| if (phdr->pd_checksum != 0 && phdr->pd_checksum != checksum) |
| { |
| corrupted = true; |
| if (verbose) |
| printf("ERROR: corruption found in %s/%s[%d], expected %x, found %x\n", |
| dirpath, filename, blkno, checksum, phdr->pd_checksum); |
| } |
| |
| if (verbose) |
| printf("DEBUG: is_page_corrupted for %s/%s[%d] returns: %d\n", |
| dirpath, filename, blkno, corrupted); |
| |
| return corrupted; |
| } |
| |
| static uint32 |
| scan_segmentfile(const char *filename, const char *dirpath) |
| { |
| |
| /* Performance considerations: |
| * segment files can be up to 1GB in size before they are split |
| * https://www.postgresql.org/docs/9.6/static/storage-file-layout.html |
| */ |
| |
| /* Always skip checking pg_internal.init because always shows as |
| * corrupted. If this file ever becomes corrupted, OK to remove |
| * it as it is recreated upon server startup. Return false since |
| * zero corrupt pages are checked here. |
| */ |
| if (strstr(filename, "pg_internal.init") != NULL) |
| return 0; |
| |
| if (verbose) |
| printf("DEBUG: scanning segment filename: %s/%s\n", |
| dirpath, filename); |
| |
| int fd; |
| char page[BLCKSZ]; |
| BlockNumber blkno = 0; |
| BlockNumber corrupted = 0; |
| |
| if (parallel) |
| { |
| char full_filepath[MAX_DIR_LENGTH]; |
| sprintf(full_filepath, "%s/%s", dirpath, filename); |
| fd = open(full_filepath, O_RDONLY); |
| } else |
| { |
| fd = open(filename, O_RDONLY); |
| } |
| |
| if (fd < 0) |
| { |
| fprintf(stderr, "ERROR: %s: %s cannot be opened\n", strerror(errno), |
| filename); |
| /* return 1 so that other segment files can be scanned, but that this |
| * segment file is marked as corrupted/some unknown error |
| */ |
| return 1; |
| } |
| |
| while (read(fd, page, BLCKSZ) == BLCKSZ) |
| { |
| if (is_page_corrupted(page, blkno, filename, dirpath)) |
| { |
| corrupted++; |
| } |
| blkno++; |
| } |
| close(fd); |
| |
| return corrupted; |
| } |
| |
| // Counts the number of entries in the given directory. We use this to count |
| // the number of elements in /pgsql/data/base, which contains only directories. |
| static void count_entries(const char *dirpath) { |
| DIR *d; |
| struct dirent *dir; |
| |
| d = opendir(dirpath); |
| |
| if (d) |
| { |
| while ((dir = readdir(d)) != NULL) |
| { |
| if (strcmp(".", dir->d_name) == 0 || |
| strcmp("..", dir->d_name) == 0) |
| continue; |
| num_directories++; |
| } |
| } |
| closedir(d); |
| } |
| |
| // Builds a list of data directories to scan. |
| static char** get_data_directories(const char *dirpath) { |
| DIR *d; |
| struct dirent *dir; |
| struct stat statbuf; |
| char **dirs = malloc(sizeof(char *) * num_directories); |
| int index = 0; |
| |
| d = opendir(dirpath); |
| |
| if (d) |
| { |
| while ((dir = readdir(d)) != NULL) |
| { |
| if (strcmp(".", dir->d_name) == 0 || |
| strcmp("..", dir->d_name) == 0) |
| continue; |
| |
| char full_dirpath[MAX_DIR_LENGTH]; |
| sprintf(full_dirpath, "%s/%s", dirpath, dir->d_name); |
| |
| // Try using d_type to verify it's a directory. If we can't, |
| // fall back to using lstat. |
| if (dir->d_type == DT_DIR) |
| { |
| dirs[index] = strdup(full_dirpath); |
| index++; |
| } else |
| { |
| int lstat_res = lstat(full_dirpath, &statbuf); |
| if (lstat_res < 0) |
| { |
| // TODO: Currently, when the file structure |
| // isn't what we expect, we log an error and keep going. |
| // We do not exit(1) from the script because then it will be |
| // reported as a checksum failure, and we will not be able |
| // to differentiate between failures due to the file |
| // structure not being what this script expects, and |
| // actual checksum failures. In theory, this should never |
| // happen, but just in case in the future Postgres changes |
| // the way they structure their data files, we should have |
| // a way to differentiate between the two cases, and exit |
| // here. |
| fprintf(stderr, "ERROR: lstat(%s) returned error: %s\n", |
| full_dirpath, strerror(errno)); |
| } |
| |
| if (S_ISDIR(statbuf.st_mode)) |
| { |
| dirs[index] = strdup(full_dirpath); |
| index++; |
| } else |
| { |
| // TODO: Same as the above comment; we should |
| // figure out a way to identify unexpected file structure. |
| fprintf(stderr, "ERROR: unexpected file strucuture; " |
| "expected %s to be a directory, but " |
| "statbuf.st_mode was %d. Only files are expected " |
| "under data directories.\n", |
| full_dirpath, statbuf.st_mode); |
| } |
| } |
| } |
| } |
| closedir(d); |
| return dirs; |
| } |
| |
| // Scans a directory within /pgsql/data/base (Ex. /pgsql/data/base/12345). |
| // The elements inside this directory should all be data files. |
| // This method should be called via a new thread so that we can scan directories |
| // in parallel. |
| int scan_data_files(const char *dirpath) { |
| DIR *d; |
| struct dirent *dirent; |
| struct stat statbuf; |
| int corrupt_pages_found = 0; |
| |
| d = opendir(dirpath); |
| |
| if (d) |
| { |
| while ((dirent = readdir(d)) != NULL) |
| { |
| if (strstr(dirent->d_name, "pg_internal.init") != NULL || |
| strcmp(".", dirent->d_name) == 0 || |
| strcmp("..", dirent->d_name) == 0) |
| continue; |
| |
| // Try using d_type to verify it's a file. If we can't, fall back |
| // to using lstat. |
| if (dirent->d_type == DT_REG) |
| { |
| corrupt_pages_found += scan_segmentfile(dirent->d_name, |
| dirpath); |
| } else |
| { |
| char full_dirpath[MAX_DIR_LENGTH]; |
| sprintf(full_dirpath, "%s/%s", dirpath, dirent->d_name); |
| |
| int lstat_res = lstat(full_dirpath, &statbuf); |
| if (lstat_res < 0) |
| { |
| // TODO: Currently, when the file structure |
| // isn't what we expect, we log an error and keep going. |
| // We do not exit(1) from the script because then it will be |
| // reported as a checksum failure, and we will not be able |
| // to differentiate between failures due to the file |
| // structure not being what this script expects, and |
| // actual checksum failures. In theory, this should never |
| // happen, but just in case in the future Postgres changes |
| // the way they structure their data files, we should have |
| // a way to differentiate between the two cases, and exit |
| // here. |
| fprintf(stderr, "ERROR: lstat(%s) returned error: %s\n", |
| full_dirpath, strerror(errno)); |
| } |
| |
| if (S_ISREG(statbuf.st_mode)) |
| { |
| corrupt_pages_found += |
| scan_segmentfile(dirent->d_name, dirpath); |
| } else |
| { |
| // TODO: Same as the above comment; we should |
| // figure out a way to identify unexpected file structure. |
| fprintf(stderr, "ERROR: unexpected file strucuture; " |
| "expected %s to be a file, but statbuf.st_mode was " |
| "%d. Only files are expected under data " |
| "directories.\n", |
| full_dirpath, statbuf.st_mode); |
| } |
| } |
| } |
| } |
| |
| closedir(d); |
| return corrupt_pages_found; |
| } |
| |
| // Gets the current queue index to process the data directory at that index, |
| // and then increments the index. |
| static int get_queue_index(int *index) { |
| pthread_mutex_lock(&mtx); |
| *index = queue_index; |
| queue_index++; |
| pthread_mutex_unlock(&mtx); |
| return *index; |
| } |
| |
| // Function used by each thread. It will continously look for the next |
| // data directory to scan from the queue, scan it, then repeat until |
| // the queue has been processed. |
| static uint32 process_directories_worker(void *dirs) { |
| const char **data_directories = (const char **)dirs; |
| uint32 corrupt_pages_found = 0; |
| |
| int index = 0; |
| while (get_queue_index(&index) < num_directories) { |
| corrupt_pages_found += scan_data_files(data_directories[index]); |
| } |
| |
| return corrupt_pages_found; |
| } |
| |
| // Builds the queue of data directories, then spins up threads to |
| // process the queue. |
| static uint32 scan_base_directory(const char *dirpath) { |
| uint32 corrupt_pages_found = 0; |
| |
| count_entries(dirpath); |
| char **dirs = get_data_directories(dirpath); |
| |
| // Start up the workers. |
| pthread_t threads[MAX_THREAD_COUNT]; |
| for (int i = 0; i < MAX_THREAD_COUNT; i++) { |
| pthread_create(threads + i, NULL, (void *)&process_directories_worker, |
| (void *)dirs); |
| } |
| |
| // Wait for all threads to finish. |
| for (int i = 0; i < MAX_THREAD_COUNT; i++) { |
| int corrupt_pages; |
| pthread_join(threads[i], (void **) &corrupt_pages); |
| corrupt_pages_found += corrupt_pages; |
| } |
| |
| // Free everything. |
| for (int i = 0; i < num_directories; i++) { |
| free(dirs[i]); |
| } |
| free(dirs); |
| |
| return corrupt_pages_found; |
| } |
| |
| static uint32 |
| scan_directory(const char *dirpath) |
| { |
| /* scope is only to check the base directory, where the actual data files |
| * are located. Other directories contain temporary files used for |
| * transactions or queries in progress and should not be checked. |
| * |
| * Postgres stores data files in one directory per database defined, |
| * without additional nesting or leafs. This causes depth of database |
| * directories to always be one. |
| */ |
| |
| DIR *d; |
| struct dirent *dir; |
| struct stat statbuf; |
| uint32 corrupt_pages_found = 0; |
| |
| d = opendir(dirpath); |
| |
| if (verbose) |
| printf("DEBUG: called scan_directory(%s)\n", dirpath); |
| |
| if (d) |
| { |
| chdir(dirpath); |
| while ((dir = readdir(d)) != NULL) |
| { |
| lstat(dir->d_name, &statbuf); |
| |
| if (verbose) |
| printf("DEBUG: direntry: %s/%s - statbuf.st_mode: %d\n", |
| dirpath, dir->d_name, statbuf.st_mode); |
| |
| if (S_ISDIR(statbuf.st_mode)) |
| { |
| if(strcmp(".", dir->d_name) == 0 || |
| strcmp("..", dir->d_name) == 0) |
| continue; |
| |
| char new_dirpath[MAX_DIR_LENGTH]; |
| sprintf(new_dirpath, "%s/%s", dirpath, dir->d_name); |
| |
| corrupt_pages_found += scan_directory(new_dirpath); |
| } |
| else if (S_ISREG(statbuf.st_mode)) |
| { |
| corrupt_pages_found += scan_segmentfile(dir->d_name, dirpath); |
| } |
| } |
| closedir(d); |
| } |
| |
| return corrupt_pages_found; |
| } |
| |
/*
 * Write the usage summary to stdout.
 *
 * argv_value - program name shown on the "Usage:" line
 */
static void
print_help(const char *argv_value)
{
    /* one entry per output line that follows the usage line */
    static const char *const option_lines[] = {
        " -v verbose\n",
        " -D directory data directory\n",
        " -h, --help print this help and exit\n",
        " -p scans data directories in parallel\n",
        "\n",
    };
    size_t line;

    printf("Usage: %s [OPTIONS]\n", argv_value);

    for (line = 0; line < sizeof option_lines / sizeof option_lines[0]; line++)
        fputs(option_lines[line], stdout);
}
| |
| int |
| main(int argc, char *argv[]) |
| { |
| /* |
| * Flow is to use a data directory and traverse to find segments, each |
| * segment file is then scanned for corrupt pages. |
| */ |
| |
| int c; |
| uint32 corrupted_pages_found = 0; |
| const char *short_opt = "chD:vp"; |
| char datadir[MAX_DIR_LENGTH]; |
| struct stat statbuf; |
| struct option long_opt[] = |
| { |
| {"dumpcorrupted", no_argument, NULL, 'c'}, |
| {"datadir", required_argument, NULL, 'D'}, |
| {"help", no_argument, NULL, 'h'}, |
| {"verbose", no_argument, NULL, 'v'}, |
| {"parallel", no_argument, NULL, 'p'}, |
| {NULL, 0, NULL, 0 } |
| }; |
| |
| /* if no arguments passed, print help and exit(1) */ |
| if (argc == 1) |
| { |
| print_help(argv[0]); |
| exit(1); |
| } |
| |
| while((c = getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1) |
| { |
| switch(c) |
| { |
| case -1: /* no more arguments */ |
| case 0: /* long options toggles */ |
| break; |
| |
| case 'c': |
| dump_corrupted = 1; |
| break; |
| |
| case 'v': |
| verbose = 1; |
| break; |
| |
| case 'p': |
| parallel = true; |
| break; |
| |
| case 'D': |
| if (optarg) |
| { |
| /* '-D' argument of data directory must contain 'base' |
| * directory to scan, check if trailing back slash and |
| * append appropriately. Must use base since other |
| * directories, such as pg_xlog, contain currently |
| * unsupported file types. |
| */ |
| strcpy(datadir, optarg); |
| if (strcmp(&optarg[strlen(optarg) - 1], "/") == 0) |
| { |
| strcat(datadir, "base"); |
| } |
| else |
| { |
| strcat(datadir, "/base"); |
| } |
| } |
| else |
| { |
| fprintf(stderr, "ERROR: -D argument could not be parsed\n"); |
| exit(1); |
| } |
| break; |
| |
| case 'h': |
| print_help(argv[0]); |
| exit(1); |
| |
| default: |
| fprintf(stderr, "ERROR: %s: invalid option -- %c\n", argv[0], c); |
| print_help(argv[0]); |
| exit(1); |
| }; |
| }; |
| |
| lstat(datadir, &statbuf); |
| if (!S_ISDIR(statbuf.st_mode)) |
| { |
| fprintf(stderr, "ERROR: base %s is not a directory\n", datadir); |
| exit(1); |
| } |
| |
| if (parallel) |
| { |
| corrupted_pages_found = scan_base_directory(datadir); |
| } else { |
| corrupted_pages_found = scan_directory(datadir); |
| } |
| |
| if (corrupted_pages_found > 0) |
| { |
| printf("CORRUPTION FOUND: %d\n", corrupted_pages_found); |
| exit(1); |
| } |
| else |
| { |
| printf("NO CORRUPTION FOUND\n"); |
| exit(0); |
| } |
| } |