blob: 35a803e8a902bf1a6e7e7233f8e9723a0b1b8ec1 [file] [log] [blame]
/*
* Utility to verify a postgres base directory.
*
* This utility scans the 'base' data directory of a postgres instance to find
* directories and segment files. Each segment file is then checked for any
* pages that may have invalid checksums. Invalid checksums are determined by
* comparing the stored checksum against the current checksum.
*
* In order to use this utility, it is necessary to enable checksums with
* initdb. The flag for initdb is either -k or --data-checksums. Without
* enabling checksums, it is not possible to use this utility.
*
*/
/*
* TODO:
* - improve diagnostics and/or repair if corrupted pages found
* - create test that simulates/creates corruption
* - check directory permissions since sudo may be required
* - handle exceptions with setjmp or signals
* - aggregate and/or create a report of corrupted blocks
* - implement dump of corrupted pages ( hexdump? )
* - run on multiple segment files simultaneously
* - detect if checksums enabled; regex for unset checksums?
* - throttling mechanism based on diskio/memory/cpu
* - convert this to an extension?
* - implement page dumping if checksum mismatch
*/
/* standard header files */
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <linux/limits.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
/* headers for directory scanning */
#include <dirent.h>
#include <stddef.h>
#include <string.h>
#include <sys/stat.h>
/* postgres specific header files */
#include "c.h"
#include "pg_config.h"
#include "postgres.h"
#include "storage/checksum_impl.h"
/* Size, in bytes, of the fixed buffers this file uses to build paths. */
#define MAX_DIR_LENGTH 256
/* Number of worker threads started by the parallel scan. */
#define MAX_THREAD_COUNT 10
/* global flag values */
/* Set by -v/--verbose: enables the DEBUG/ERROR printf output. */
int verbose = 0;
/* Set by -c/--dumpcorrupted; not read by any code visible in this file. */
int dump_corrupted = 0;
/* Set by -p/--parallel: selects scan_base_directory() and makes the
 * per-file helpers build full paths instead of relying on the cwd. */
bool parallel = false;
/* Incremented by count_entries(); used as the length of the directory queue
 * and as the bound when the queue is freed. */
int num_directories = 0;
// Variables used to control the multi-threading.
/* Next queue position to hand out; read and incremented under mtx in
 * get_queue_index(). */
volatile int queue_index = 0;
/* Protects queue_index. */
pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
// SPECKLE_POSTGRES: Stand-in for the assertion-failure hook needed when this
// tool is built against postgres with assertions enabled. The useful
// definition is at cs/src/backend/utils/error/assert.c but this is not
// currently exported outside the postgres tree.
//
// All four arguments are ignored; the process is terminated unconditionally.
void ExceptionalCondition(const char *conditionName, const char *errorType,
                          const char *fileName, int lineNumber)
{
    (void) conditionName;
    (void) errorType;
    (void) fileName;
    (void) lineNumber;

    abort();
}
/*
 * Determine segment number by segment file name. For instance, if file
 * name is /path/to/xxxx.7 procedure returns 7. Default return value is 0.
 *
 * The name must end in '.' followed by decimal digits for a non-zero result;
 * anything else (no dot, a non-digit suffix such as "_fsm", an empty or NULL
 * name) yields 0.
 */
static unsigned int
get_segment_number(const char* fileName)
{
    size_t digitsStart;

    if (fileName == NULL)
        return 0;

    /* Walk backwards over the trailing run of digits.  The cast keeps
     * isdigit() defined for bytes with the high bit set. */
    digitsStart = strlen(fileName);
    while (digitsStart > 0 &&
           isdigit((unsigned char) fileName[digitsStart - 1]))
        digitsStart--;

    /* The digit run must be immediately preceded by a '.' */
    if (digitsStart == 0 || fileName[digitsStart - 1] != '.')
        return 0;

    /* strtoul is well defined on overflow, unlike atoi */
    return (unsigned int) strtoul(&fileName[digitsStart], NULL, 10);
}
/*
 * Check a page header checksum value against the current checksum value of
 * the page.
 *
 * page     - buffer holding one BLCKSZ-sized page
 * blkno    - block number of the page relative to its segment file
 * filename - segment file name; its numeric suffix selects the segment
 * dirpath  - directory containing the file (used for messages only)
 *
 * Returns true when the stored checksum is non-zero and differs from the
 * computed one. A stored checksum of zero is treated as "not set" and is
 * never reported. There is a similar function PageIsVerified responsible for
 * checking pages before they are loaded into buffer pool.
 * see: src/backend/storage/page/bufpage.c
 *
 * Consider returning a negative value if page is new or checksum unset
 * or if more detail for a page verification can be found.
 */
static bool
is_page_corrupted(const char *page, BlockNumber blkno, const char *filename,
                  const char *dirpath)
{
    PageHeader phdr = (PageHeader)page;

    /* Segment size in bytes, BLCKSZ is 8192 by default, 8KB pages
     * 1GB segment files are 131072 blocks of 8KB page size
     * NOTE: pd_pagesize_version is BLCKSZ + version, since 8.3+, version is 4,
     * resulting in pd_pagesize_version being 8196 when pagesize is 8KB
     */
    const unsigned int segmentSize = RELSEG_SIZE * BLCKSZ;

    /* Number of current segment.  This is a plain local on purpose: worker
     * threads call this function concurrently, so it must not be shared.
     * Only the trailing ".N" of the name matters, so the bare file name is
     * enough and no path needs to be assembled.
     */
    const unsigned int segmentNumber = get_segment_number(filename);

    /* The block number fed to the checksum needs to be absolute, so that
     * pages in later segment files are numbered across all segment files and
     * not relative to the current one.
     * see: https://git.postgresql.org/gitweb/?p=pg_filedump.git;a=commitdiff;h=052ed0112967dd1e9b0e2cbe54821c04475f1a3a;hp=b163cdaa53b651958cc8
     */
    const uint32 segmentBlockOffset = RELSEG_SIZE * segmentNumber;
    const uint16 checksum = pg_checksum_page((char *)page,
                                             segmentBlockOffset + blkno);
    bool corrupted = false;

    if (verbose)
    {
        printf("DEBUG: filename: %s/%s[%u]\n "
               "\tsegmentBlockOffset: %u, maxSegmentSize: %u,\n "
               "\tsegmentNumber: %u, relative blkno: %u, absolute blkno: %u,\n "
               "\tchecksum: %x, phdr->pd_checksum: %x,\n "
               "\tphdr->pd_flags: %d, phdr->pd_lower: %d, phdr->pd_upper: %d,\n "
               "\tphdr->pd_special: %d, phdr->pd_pagesize_version: %d,\n "
               "\tphdr->pd_prune_xid: %u\n",
               dirpath, filename, blkno,
               segmentBlockOffset, segmentSize,
               segmentNumber, blkno, segmentBlockOffset + blkno,
               checksum, phdr->pd_checksum,
               phdr->pd_flags, phdr->pd_lower, phdr->pd_upper,
               phdr->pd_special, phdr->pd_pagesize_version,
               phdr->pd_prune_xid);
    }

    if (phdr->pd_checksum != 0 && phdr->pd_checksum != checksum)
    {
        corrupted = true;
        if (verbose)
            printf("ERROR: corruption found in %s/%s[%u], expected %x, found %x\n",
                   dirpath, filename, blkno, checksum, phdr->pd_checksum);
    }

    if (verbose)
        printf("DEBUG: is_page_corrupted for %s/%s[%u] returns: %d\n",
               dirpath, filename, blkno, corrupted);

    return corrupted;
}
/*
 * Scan one segment file page by page and count pages whose checksum does
 * not verify.
 *
 * filename - name of the segment file inside dirpath
 * dirpath  - directory containing the file
 *
 * In parallel mode the file is opened as dirpath/filename; otherwise it is
 * opened by bare name, so the caller must have made dirpath the current
 * working directory.
 *
 * Returns the number of corrupted pages. A file that cannot be opened, or
 * that fails with a read error, adds 1 so that the failure is not reported
 * as a clean scan. A trailing partial block is ignored.
 *
 * Performance considerations:
 * segment files can be up to 1GB in size before they are split
 * https://www.postgresql.org/docs/9.6/static/storage-file-layout.html
 */
static uint32
scan_segmentfile(const char *filename, const char *dirpath)
{
    /* The page is later accessed through wider integer types, so make sure
     * the buffer is suitably aligned rather than a bare char array. */
    union
    {
        char data[BLCKSZ];
        double force_align_d;
        long long force_align_i;
    } page;
    char full_filepath[PATH_MAX];
    const char *open_path = filename;
    BlockNumber blkno = 0;
    BlockNumber corrupted = 0;
    int fd;

    /* Always skip checking pg_internal.init because always shows as
     * corrupted. If this file ever becomes corrupted, OK to remove
     * it as it is recreated upon server startup. Return 0 since
     * zero corrupt pages are checked here.
     */
    if (strstr(filename, "pg_internal.init") != NULL)
        return 0;

    if (verbose)
        printf("DEBUG: scanning segment filename: %s/%s\n",
               dirpath, filename);

    if (parallel)
    {
        int len = snprintf(full_filepath, sizeof(full_filepath), "%s/%s",
                           dirpath, filename);

        if (len < 0 || (size_t) len >= sizeof(full_filepath))
        {
            /* Path does not fit: treat like any other open failure */
            fprintf(stderr, "ERROR: %s: %s cannot be opened\n",
                    strerror(ENAMETOOLONG), filename);
            return 1;
        }
        open_path = full_filepath;
    }

    fd = open(open_path, O_RDONLY);
    if (fd < 0)
    {
        fprintf(stderr, "ERROR: %s: %s cannot be opened\n", strerror(errno),
                filename);
        /* return 1 so that other segment files can be scanned, but that this
         * segment file is marked as corrupted/some unknown error
         */
        return 1;
    }

    for (;;)
    {
        ssize_t nread = read(fd, page.data, BLCKSZ);

        if (nread == BLCKSZ)
        {
            if (is_page_corrupted(page.data, blkno, filename, dirpath))
                corrupted++;
            blkno++;
            continue;
        }

        /* Interrupted before any data was transferred: just retry */
        if (nread < 0 && errno == EINTR)
            continue;

        if (nread < 0)
        {
            /* A read error must not look like a clean end of file */
            fprintf(stderr, "ERROR: %s: %s cannot be read\n", strerror(errno),
                    filename);
            corrupted++;
        }
        break;
    }

    close(fd);
    return corrupted;
}
// Counts the number of entries in the given directory, excluding "." and
// "..", and adds that count to the global num_directories. We use this to
// count the number of elements in /pgsql/data/base, which contains only
// directories.
//
// If the directory cannot be opened an error is logged and num_directories
// is left unchanged.
static void count_entries(const char *dirpath) {
    struct dirent *dir;
    DIR *d = opendir(dirpath);

    if (d == NULL)
    {
        fprintf(stderr, "ERROR: opendir(%s) returned error: %s\n",
                dirpath, strerror(errno));
        return;
    }

    while ((dir = readdir(d)) != NULL)
    {
        if (strcmp(".", dir->d_name) == 0 ||
            strcmp("..", dir->d_name) == 0)
            continue;
        num_directories++;
    }

    /* Only close a stream that was actually opened */
    closedir(d);
}
// Builds a list of data directories to scan.
//
// On entry the global num_directories is the capacity to allocate (as
// counted by count_entries). On return num_directories is set to the number
// of directory paths actually stored, so every slot in [0, num_directories)
// holds a valid heap-allocated string. Entries that are not directories,
// cannot be stat'ed, or whose path does not fit are logged and skipped.
//
// Returns a heap-allocated array owned by the caller (free each element,
// then the array), or NULL when there is nothing to scan or allocation
// failed; in that case num_directories is 0.
static char** get_data_directories(const char *dirpath) {
    const int capacity = num_directories;
    char **dirs = NULL;
    struct dirent *dir;
    int index = 0;
    DIR *d;

    /* calloc so that any slot left unused is a well-defined NULL */
    if (capacity > 0)
        dirs = calloc((size_t) capacity, sizeof(*dirs));
    if (dirs == NULL)
    {
        if (capacity > 0)
            fprintf(stderr, "ERROR: out of memory building directory list "
                    "for %s\n", dirpath);
        num_directories = 0;
        return NULL;
    }

    d = opendir(dirpath);
    if (d == NULL)
    {
        fprintf(stderr, "ERROR: opendir(%s) returned error: %s\n",
                dirpath, strerror(errno));
        num_directories = 0;
        return dirs;
    }

    /* The directory may have gained entries since it was counted, so never
     * store more than the array can hold. */
    while (index < capacity && (dir = readdir(d)) != NULL)
    {
        char full_dirpath[PATH_MAX];
        char *copy;
        int len;

        if (strcmp(".", dir->d_name) == 0 ||
            strcmp("..", dir->d_name) == 0)
            continue;

        len = snprintf(full_dirpath, sizeof(full_dirpath), "%s/%s",
                       dirpath, dir->d_name);
        if (len < 0 || (size_t) len >= sizeof(full_dirpath))
        {
            fprintf(stderr, "ERROR: path %s/%s is too long; skipping\n",
                    dirpath, dir->d_name);
            continue;
        }

        // Try using d_type to verify it's a directory. If we can't,
        // fall back to using lstat.
        if (dir->d_type != DT_DIR)
        {
            struct stat statbuf;

            if (lstat(full_dirpath, &statbuf) < 0)
            {
                // TODO: Currently, when the file structure
                // isn't what we expect, we log an error and keep going.
                // We do not exit(1) from the script because then it will be
                // reported as a checksum failure, and we will not be able
                // to differentiate between failures due to the file
                // structure not being what this script expects, and
                // actual checksum failures. In theory, this should never
                // happen, but just in case in the future Postgres changes
                // the way they structure their data files, we should have
                // a way to differentiate between the two cases, and exit
                // here.
                fprintf(stderr, "ERROR: lstat(%s) returned error: %s\n",
                        full_dirpath, strerror(errno));
                /* statbuf holds nothing meaningful; do not inspect it */
                continue;
            }
            if (!S_ISDIR(statbuf.st_mode))
            {
                // TODO: Same as the above comment; we should
                // figure out a way to identify unexpected file structure.
                fprintf(stderr, "ERROR: unexpected file strucuture; "
                        "expected %s to be a directory, but "
                        "statbuf.st_mode was %d. Only files are expected "
                        "under data directories.\n",
                        full_dirpath, (int) statbuf.st_mode);
                continue;
            }
        }

        copy = strdup(full_dirpath);
        if (copy == NULL)
        {
            fprintf(stderr, "ERROR: out of memory storing path %s\n",
                    full_dirpath);
            continue;
        }
        dirs[index++] = copy;
    }
    closedir(d);

    /* Publish how many slots are valid so that the workers and the cleanup
     * loop never touch an unused slot. */
    num_directories = index;
    return dirs;
}
// Scans a directory within /pgsql/data/base (Ex. /pgsql/data/base/12345).
// The elements inside this directory should all be data files.
// This method should be called via a new thread so that we can scan directories
// in parallel.
//
// Returns the sum of scan_segmentfile() results for every regular file in
// dirpath. Entries that are not regular files, or that cannot be inspected,
// are logged to stderr and skipped. A directory that cannot be opened is
// logged and contributes 0.
int scan_data_files(const char *dirpath) {
    int corrupt_pages_found = 0;
    struct dirent *dirent;
    DIR *d;

    if (dirpath == NULL)
        return 0;

    d = opendir(dirpath);
    if (d == NULL)
    {
        fprintf(stderr, "ERROR: opendir(%s) returned error: %s\n",
                dirpath, strerror(errno));
        return 0;
    }

    while ((dirent = readdir(d)) != NULL)
    {
        if (strstr(dirent->d_name, "pg_internal.init") != NULL ||
            strcmp(".", dirent->d_name) == 0 ||
            strcmp("..", dirent->d_name) == 0)
            continue;

        // Try using d_type to verify it's a file. If we can't, fall back
        // to using lstat.
        if (dirent->d_type != DT_REG)
        {
            char full_dirpath[PATH_MAX];
            struct stat statbuf;
            int len = snprintf(full_dirpath, sizeof(full_dirpath), "%s/%s",
                               dirpath, dirent->d_name);

            if (len < 0 || (size_t) len >= sizeof(full_dirpath))
            {
                fprintf(stderr, "ERROR: path %s/%s is too long; skipping\n",
                        dirpath, dirent->d_name);
                continue;
            }

            if (lstat(full_dirpath, &statbuf) < 0)
            {
                // TODO: Currently, when the file structure
                // isn't what we expect, we log an error and keep going.
                // We do not exit(1) from the script because then it will be
                // reported as a checksum failure, and we will not be able
                // to differentiate between failures due to the file
                // structure not being what this script expects, and
                // actual checksum failures. In theory, this should never
                // happen, but just in case in the future Postgres changes
                // the way they structure their data files, we should have
                // a way to differentiate between the two cases, and exit
                // here.
                fprintf(stderr, "ERROR: lstat(%s) returned error: %s\n",
                        full_dirpath, strerror(errno));
                /* statbuf holds nothing meaningful; do not inspect it */
                continue;
            }
            if (!S_ISREG(statbuf.st_mode))
            {
                // TODO: Same as the above comment; we should
                // figure out a way to identify unexpected file structure.
                fprintf(stderr, "ERROR: unexpected file strucuture; "
                        "expected %s to be a file, but statbuf.st_mode was "
                        "%d. Only files are expected under data "
                        "directories.\n",
                        full_dirpath, (int) statbuf.st_mode);
                continue;
            }
        }

        corrupt_pages_found += scan_segmentfile(dirent->d_name, dirpath);
    }

    closedir(d);
    return corrupt_pages_found;
}
// Claims the next position in the directory queue: stores the current
// queue_index in *index, advances queue_index by one, and returns the
// claimed value. The read and the increment happen while holding mtx.
static int get_queue_index(int *index) {
    int claimed;

    pthread_mutex_lock(&mtx);
    claimed = queue_index++;
    *index = claimed;
    pthread_mutex_unlock(&mtx);

    return claimed;
}
// Function used by each thread. It will continously look for the next
// data directory to scan from the queue, scan it, then repeat until
// the queue has been processed.
static uint32 process_directories_worker(void *dirs) {
const char **data_directories = (const char **)dirs;
uint32 corrupt_pages_found = 0;
int index = 0;
while (get_queue_index(&index) < num_directories) {
corrupt_pages_found += scan_data_files(data_directories[index]);
}
return corrupt_pages_found;
}
// Builds the queue of data directories, then spins up threads to
// process the queue.
static uint32 scan_base_directory(const char *dirpath) {
uint32 corrupt_pages_found = 0;
count_entries(dirpath);
char **dirs = get_data_directories(dirpath);
// Start up the workers.
pthread_t threads[MAX_THREAD_COUNT];
for (int i = 0; i < MAX_THREAD_COUNT; i++) {
pthread_create(threads + i, NULL, (void *)&process_directories_worker,
(void *)dirs);
}
// Wait for all threads to finish.
for (int i = 0; i < MAX_THREAD_COUNT; i++) {
int corrupt_pages;
pthread_join(threads[i], (void **) &corrupt_pages);
corrupt_pages_found += corrupt_pages;
}
// Free everything.
for (int i = 0; i < num_directories; i++) {
free(dirs[i]);
}
free(dirs);
return corrupt_pages_found;
}
/*
 * Recursively scan dirpath (single-threaded mode) and return the number of
 * corrupt pages found in the regular files beneath it.
 *
 * scope is only to check the base directory, where the actual data files
 * are located. Other directories contain temporary files used for
 * transactions or queries in progress and should not be checked.
 *
 * Postgres stores data files in one directory per database defined,
 * without additional nesting or leafs. This causes depth of database
 * directories to always be one.
 *
 * While the entries of a directory are processed, that directory is the
 * current working directory, because scan_segmentfile() opens files by bare
 * name in this mode. The working directory in effect on entry is restored
 * before each recursive call and before returning, so that dirpath may be
 * relative as well as absolute.
 *
 * Directories or entries that cannot be opened or inspected are logged to
 * stderr and skipped; they contribute 0 to the result.
 */
static uint32
scan_directory(const char *dirpath)
{
    uint32 corrupt_pages_found = 0;
    char start_cwd[PATH_MAX];
    struct dirent *dir;
    DIR *d;

    if (verbose)
        printf("DEBUG: called scan_directory(%s)\n", dirpath);

    if (getcwd(start_cwd, sizeof(start_cwd)) == NULL)
    {
        fprintf(stderr, "ERROR: getcwd returned error: %s\n", strerror(errno));
        return 0;
    }

    d = opendir(dirpath);
    if (d == NULL)
    {
        fprintf(stderr, "ERROR: opendir(%s) returned error: %s\n",
                dirpath, strerror(errno));
        return 0;
    }

    if (chdir(dirpath) != 0)
    {
        fprintf(stderr, "ERROR: chdir(%s) returned error: %s\n",
                dirpath, strerror(errno));
        closedir(d);
        return 0;
    }

    while ((dir = readdir(d)) != NULL)
    {
        struct stat statbuf;

        if (lstat(dir->d_name, &statbuf) != 0)
        {
            fprintf(stderr, "ERROR: lstat(%s/%s) returned error: %s\n",
                    dirpath, dir->d_name, strerror(errno));
            continue;
        }

        if (verbose)
            printf("DEBUG: direntry: %s/%s - statbuf.st_mode: %d\n",
                   dirpath, dir->d_name, (int) statbuf.st_mode);

        if (S_ISDIR(statbuf.st_mode))
        {
            char new_dirpath[PATH_MAX];
            int len;

            if (strcmp(".", dir->d_name) == 0 ||
                strcmp("..", dir->d_name) == 0)
                continue;

            len = snprintf(new_dirpath, sizeof(new_dirpath), "%s/%s",
                           dirpath, dir->d_name);
            if (len < 0 || (size_t) len >= sizeof(new_dirpath))
            {
                fprintf(stderr, "ERROR: path %s/%s is too long; skipping\n",
                        dirpath, dir->d_name);
                continue;
            }

            /* new_dirpath is expressed relative to the directory we started
             * in, so go back there before descending ... */
            if (chdir(start_cwd) != 0)
            {
                fprintf(stderr, "ERROR: chdir(%s) returned error: %s\n",
                        start_cwd, strerror(errno));
                break;
            }
            corrupt_pages_found += scan_directory(new_dirpath);

            /* ... and return to this directory for the remaining entries */
            if (chdir(start_cwd) != 0 || chdir(dirpath) != 0)
            {
                fprintf(stderr, "ERROR: chdir(%s) returned error: %s\n",
                        dirpath, strerror(errno));
                break;
            }
        }
        else if (S_ISREG(statbuf.st_mode))
        {
            corrupt_pages_found += scan_segmentfile(dir->d_name, dirpath);
        }
    }
    closedir(d);

    if (chdir(start_cwd) != 0)
        fprintf(stderr, "ERROR: chdir(%s) returned error: %s\n",
                start_cwd, strerror(errno));

    return corrupt_pages_found;
}
/*
 * Write the usage summary to stdout.
 *
 * argv_value - program name shown in the "Usage:" line
 */
static void
print_help(const char *argv_value)
{
    static const char *const option_lines[] = {
        " -v verbose\n",
        " -D directory data directory\n",
        " -h, --help print this help and exit\n",
        " -p scans data directories in parallel\n",
        "\n",
    };
    const size_t line_count = sizeof(option_lines) / sizeof(option_lines[0]);

    printf("Usage: %s [OPTIONS]\n", argv_value);
    for (size_t i = 0; i < line_count; i++)
        fputs(option_lines[i], stdout);
}
/*
 * Entry point.
 *
 * Flow is to use a data directory and traverse to find segments, each
 * segment file is then scanned for corrupt pages.
 *
 * Exit status is 0 when no corrupt pages were counted, and 1 when corruption
 * was counted, when help was printed, or when the arguments are invalid.
 */
int
main(int argc, char *argv[])
{
    int c;
    uint32 corrupted_pages_found = 0;
    const char *short_opt = "chD:vp";
    /* Empty until -D is seen, so a missing -D can be detected below */
    char datadir[MAX_DIR_LENGTH] = "";
    struct stat statbuf;
    struct option long_opt[] =
    {
        {"dumpcorrupted", no_argument, NULL, 'c'},
        {"datadir", required_argument, NULL, 'D'},
        {"help", no_argument, NULL, 'h'},
        {"verbose", no_argument, NULL, 'v'},
        {"parallel", no_argument, NULL, 'p'},
        {NULL, 0, NULL, 0 }
    };

    /* if no arguments passed, print help and exit(1) */
    if (argc == 1)
    {
        print_help(argv[0]);
        exit(1);
    }

    while ((c = getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1)
    {
        switch (c)
        {
            case 0: /* long options toggles */
                break;
            case 'c':
                dump_corrupted = 1;
                break;
            case 'v':
                verbose = 1;
                break;
            case 'p':
                parallel = true;
                break;
            case 'D':
                if (optarg && optarg[0] != '\0')
                {
                    /* '-D' argument of data directory must contain 'base'
                     * directory to scan, check if trailing slash and
                     * append appropriately. Must use base since other
                     * directories, such as pg_xlog, contain currently
                     * unsupported file types.
                     */
                    const size_t arglen = strlen(optarg);
                    const char *separator =
                        (optarg[arglen - 1] == '/') ? "" : "/";
                    int len = snprintf(datadir, sizeof(datadir), "%s%sbase",
                                       optarg, separator);

                    if (len < 0 || (size_t) len >= sizeof(datadir))
                    {
                        fprintf(stderr, "ERROR: -D argument is too long "
                                "(limit is %d bytes including /base)\n",
                                MAX_DIR_LENGTH - 1);
                        exit(1);
                    }
                }
                else
                {
                    fprintf(stderr, "ERROR: -D argument could not be parsed\n");
                    exit(1);
                }
                break;
            case 'h':
                print_help(argv[0]);
                exit(1);
            default:
                fprintf(stderr, "ERROR: %s: invalid option -- %c\n", argv[0], c);
                print_help(argv[0]);
                exit(1);
        }
    }

    if (datadir[0] == '\0')
    {
        fprintf(stderr, "ERROR: -D data directory is required\n");
        print_help(argv[0]);
        exit(1);
    }

    /* A failed lstat leaves statbuf undefined, so test its result first */
    if (lstat(datadir, &statbuf) != 0 || !S_ISDIR(statbuf.st_mode))
    {
        fprintf(stderr, "ERROR: base %s is not a directory\n", datadir);
        exit(1);
    }

    if (parallel)
    {
        corrupted_pages_found = scan_base_directory(datadir);
    } else {
        corrupted_pages_found = scan_directory(datadir);
    }

    if (corrupted_pages_found > 0)
    {
        printf("CORRUPTION FOUND: %u\n", corrupted_pages_found);
        exit(1);
    }
    else
    {
        printf("NO CORRUPTION FOUND\n");
        exit(0);
    }
}