/*
 * Utility to verify a postgres base directory.
 *
 * This utility scans the 'base' data directory of a postgres instance to find
 * directories and segment files.  Each segment file is then checked for any
 * pages that may have invalid checksums.  Invalid checksums are determined by
 * comparing the stored checksum against the current checksum.
 *
 * In order to use this utility, it is necessary to enable checksums with
 * initdb.  The flag for initdb is either -k or --data-checksums.  Without
 * enabling checksums, it is not possible to use this utility.
 *
 */

/*
 * TODO:
 *      - improve diagnostics and/or repair if corrupted pages found
 *      - create test that simulates/creates corruption
 *      - check directory permissions since sudo may be required
 *      - handle exceptions with setjmp or signals
 *      - aggregate and/or create a report of corrupted blocks
 *      - implement dump of corrupted pages ( hexdump? )
 *      - run on multiple segment files simultaneously
 *      - detect if checksums enabled; regex for unset checksums?
 *      - throttling mechanism based on diskio/memory/cpu
 *      - convert this to an extension?
 *      - implement page dumping if checksum mismatch
 */

/* standard header files */
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <linux/limits.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* headers for directory scanning */
#include <dirent.h>
#include <stddef.h>
#include <string.h>
#include <sys/stat.h>

/* postgres specific header files */
#include "c.h"
#include "pg_config.h"
#include "postgres.h"
#include "storage/checksum_impl.h"

/*
 * Size of the buffers used to build paths.  It is tied to PATH_MAX so that a
 * directory path joined with a maximum-length directory entry name still
 * fits.
 */
#define MAX_DIR_LENGTH PATH_MAX

/* Upper bound on the number of worker threads started in parallel mode. */
#define MAX_THREAD_COUNT 10

/* global flag values */
int verbose = 0;          /* set by -v: print DEBUG/ERROR details per page */
int dump_corrupted = 0;   /* set by -c */
bool parallel = false;    /* set by -p: scan data directories with threads */
int num_directories = 0;  /* number of entries found under the base directory */

// Variables used to control the multi-threading.
// queue_index is the next slot of the directory queue to hand out.  Every
// access goes through get_queue_index(), which holds mtx, so a plain int is
// sufficient here.
int queue_index = 0;
pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;

/*
 * SPECKLE_POSTGRES: dummy assertion-failure hook for builds with postgres
 * assertions enabled.  The useful definition lives in
 * cs/src/backend/utils/error/assert.c, but that one is not currently
 * exported outside the postgres tree, so this stand-in just aborts.
 */
void
ExceptionalCondition(const char *conditionName, const char *errorType,
                     const char *fileName, int lineNumber)
{
    /* None of the diagnostic arguments are used by this stand-in. */
    (void) conditionName;
    (void) errorType;
    (void) fileName;
    (void) lineNumber;

    abort();
}

/*
 * Determine the segment number from a segment file name.
 *
 * A segment suffix is a '.' followed by one or more decimal digits at the
 * very end of the name; for instance "/path/to/xxxx.7" yields 7.  Any name
 * without such a suffix (empty, all digits, no trailing digits, or digits
 * not preceded by '.') yields the default of 0.
 */
static unsigned int
get_segment_number(const char *fileName)
{
    size_t len = strlen(fileName);
    size_t digitsStart = len;

    /*
     * Walk backwards over the trailing run of digits.  The cast to unsigned
     * char keeps isdigit() well defined for bytes with the high bit set.
     */
    while (digitsStart > 0 &&
           isdigit((unsigned char) fileName[digitsStart - 1]))
        digitsStart--;

    /* No trailing digits at all, or the whole name consists of digits. */
    if (digitsStart == len || digitsStart == 0)
        return 0;

    /* The digits only count as a segment number when preceded by '.'. */
    if (fileName[digitsStart - 1] != '.')
        return 0;

    /* strtoul saturates on overflow instead of invoking undefined behavior. */
    return (unsigned int) strtoul(&fileName[digitsStart], NULL, 10);
}

/*
 * Check a page header checksum value against the checksum computed from the
 * page contents.
 *
 *   page     - one page as read from the segment file
 *   blkno    - block number of the page relative to the start of its
 *              segment file
 *   filename - name of the segment file; its ".N" suffix supplies the
 *              segment number
 *   dirpath  - directory containing the file; only used in messages
 *
 * Returns true when the stored checksum is non-zero and differs from the
 * computed one.  New pages have a checksum of zero until it is set, so a
 * stored checksum of zero is never reported as corruption.
 *
 * There is a similar function PageIsVerified responsible for checking pages
 * before they are loaded into the buffer pool.
 *  see:  src/backend/storage/page/bufpage.c
 *
 * Consider returning a negative value if the page is new or the checksum is
 * unset, or if more detail for a page verification can be found.
 */
static bool
is_page_corrupted(const char *page, BlockNumber blkno, const char *filename,
    const char *dirpath)
{
    PageHeader phdr = (PageHeader)page;

    /* Segment size in bytes, BLCKSZ is 8192 by default, 8KB pages
     *  1GB segment files are 131072 blocks of 8KB page size
     *  NOTE: pd_pagesize_version is BLCKSZ + version, since 8.3+, version is 4,
     *   resulting in pd_pagesize_version being 8196 when pagesize is 8KB
     *  Only used in the DEBUG output below.
     */
    const unsigned int segmentSize = RELSEG_SIZE * BLCKSZ;

    /* Number of the current segment.  This must be a plain local: the
     * function runs concurrently in several threads in parallel mode, and
     * shared storage here would let one thread's segment number leak into
     * another thread's checksum calculation.  The bare file name is enough
     * to find the ".N" suffix, so no full path needs to be assembled.
     */
    const unsigned int segmentNumber = get_segment_number(filename);

    /* The block number fed to the checksum needs to be absolute, i.e. based
     * on all previous segment files and not relative to the current segment
     * file.
     *  see: https://git.postgresql.org/gitweb/?p=pg_filedump.git;a=commitdiff;h=052ed0112967dd1e9b0e2cbe54821c04475f1a3a;hp=b163cdaa53b651958cc8
     */
    uint32 segmentBlockOffset = RELSEG_SIZE * segmentNumber;

    /* NOTE(review): the cast drops const, presumably because
     * pg_checksum_page is declared with a non-const page argument - confirm
     * against storage/checksum_impl.h.
     */
    uint16 checksum = pg_checksum_page((char *)page, segmentBlockOffset + blkno);

    bool corrupted = false;

    if (verbose)
    {
        printf("DEBUG: filename: %s/%s[%d]\n \
            \tsegmentBlockOffset: %d, maxSegmentSize: %d,\n \
            \tsegmentNumber: %d, relative blkno: %d, absolute blkno: %d,\n \
            \tchecksum: %x, phdr->pd_checksum: %x,\n \
            \tphdr->pd_flags: %d, phdr->pd_lower: %d, phdr->pd_upper: %d,\n \
            \tphdr->pd_special: %d, phdr->pd_pagesize_version: %d,\n \
            \tphdr->pd_prune_xid: %d\n",
            dirpath, filename, blkno,
            segmentBlockOffset, segmentSize,
            segmentNumber, blkno, segmentBlockOffset + blkno,
            checksum, phdr->pd_checksum,
            phdr->pd_flags, phdr->pd_lower, phdr->pd_upper,
            phdr->pd_special, phdr->pd_pagesize_version,
            phdr->pd_prune_xid );
    }

    /* A zero stored checksum means "not set" and is skipped. */
    if (phdr->pd_checksum != 0 && phdr->pd_checksum != checksum)
    {
        corrupted = true;
        if (verbose)
            printf("ERROR: corruption found in %s/%s[%d], expected %x, found %x\n",
                dirpath, filename, blkno, checksum, phdr->pd_checksum);
    }

    if (verbose)
        printf("DEBUG: is_page_corrupted for %s/%s[%d] returns: %d\n",
                dirpath, filename, blkno, corrupted);

    return corrupted;
}

/*
 * Scan one segment file page by page and count the pages whose checksum
 * does not verify.
 *
 *   filename - name of the file inside dirpath
 *   dirpath  - directory containing the file
 *
 * In parallel mode the file is opened through "dirpath/filename"; otherwise
 * it is opened by bare name, relative to the current working directory.
 *
 * Returns the number of corrupted pages.  A file that cannot be opened
 * returns 1, and a read error adds 1, so that the problem is surfaced as a
 * failure while other segment files can still be scanned.
 *
 * Performance considerations:
 * segment files can be up to 1GB in size before they are split
 * https://www.postgresql.org/docs/9.6/static/storage-file-layout.html
 */
static uint32
scan_segmentfile(const char *filename, const char *dirpath)
{
    /* The page is later accessed through a page header pointer, so give the
     * buffer stricter alignment than a plain char array guarantees.
     */
    union
    {
        char        data[BLCKSZ];
        double      force_align_d;
        long long   force_align_ll;
    } page;
    BlockNumber blkno = 0;
    BlockNumber corrupted = 0;
    int fd;

    /* Always skip checking pg_internal.init because always shows as
     * corrupted.  If this file ever becomes corrupted, OK to remove
     * it as it is recreated upon server startup. Return zero since
     * no pages are checked here.
     */
    if (strstr(filename, "pg_internal.init") != NULL)
        return 0;

    if (verbose)
        printf("DEBUG: scanning segment filename: %s/%s\n",
            dirpath, filename);

    if (parallel)
    {
        char full_filepath[MAX_DIR_LENGTH];
        int len = snprintf(full_filepath, sizeof(full_filepath), "%s/%s",
                           dirpath, filename);

        /* Refuse to open a truncated path; it could name a different file. */
        if (len < 0 || (size_t) len >= sizeof(full_filepath))
        {
            fprintf(stderr, "ERROR: path %s/%s is too long to be opened\n",
                    dirpath, filename);
            return 1;
        }
        fd = open(full_filepath, O_RDONLY);
    } else
    {
        fd = open(filename, O_RDONLY);
    }

    if (fd < 0)
    {
        fprintf(stderr, "ERROR: %s: %s cannot be opened\n", strerror(errno),
                filename);
        /* return 1 so that other segment files can be scanned, but that this
         * segment file is marked as corrupted/some unknown error
         */
        return 1;
    }

    for (;;)
    {
        ssize_t nread = read(fd, page.data, BLCKSZ);

        if (nread < 0)
        {
            /* An interrupted read is simply retried. */
            if (errno == EINTR)
                continue;

            fprintf(stderr, "ERROR: %s: %s could not be read\n",
                    strerror(errno), filename);
            /* Same policy as an unopenable file: flag it, keep going. */
            corrupted++;
            break;
        }

        /* End of file, or a trailing fragment shorter than one page. */
        if (nread != BLCKSZ)
            break;

        if (is_page_corrupted(page.data, blkno, filename, dirpath))
            corrupted++;
        blkno++;
    }
    close(fd);

    return corrupted;
}

// Counts the number of entries in the given directory, skipping "." and
// "..", and adds that count to the global num_directories. We use this to
// count the number of elements in /pgsql/data/base, which contains only
// directories. A directory that cannot be opened is reported on stderr and
// contributes nothing to the count.
static void count_entries(const char *dirpath) {
    DIR *d = opendir(dirpath);
    struct dirent *dir;

    if (d == NULL)
    {
        fprintf(stderr, "ERROR: %s: %s cannot be opened\n", strerror(errno),
                dirpath);
        return;
    }

    while ((dir = readdir(d)) != NULL)
    {
        if (strcmp(".", dir->d_name) == 0 ||
            strcmp("..", dir->d_name) == 0)
            continue;
        num_directories++;
    }

    // Only reached with a valid stream; closedir() must not be given NULL.
    closedir(d);
}

// Builds a list of data directories to scan.
//
// Every entry of dirpath that is a directory (other than "." and "..") is
// returned as a heap-allocated "dirpath/name" string. The array is sized
// from num_directories and grows if more directories turn up than were
// counted. On return num_directories is set to the number of strings
// actually stored, so that consumers indexing or freeing the array by
// num_directories never touch an unfilled slot.
//
// The caller owns the result and frees each string and then the array.
// Running out of memory terminates the program with exit(1).
static char** get_data_directories(const char *dirpath) {
    DIR *d;
    struct dirent *dir;
    struct stat statbuf;
    int capacity = num_directories > 0 ? num_directories : 1;
    int index = 0;
    char **dirs = malloc(sizeof(*dirs) * (size_t) capacity);

    if (dirs == NULL)
    {
        fprintf(stderr, "ERROR: out of memory while listing %s\n", dirpath);
        exit(1);
    }

    d = opendir(dirpath);
    if (d == NULL)
    {
        fprintf(stderr, "ERROR: %s: %s cannot be opened\n", strerror(errno),
                dirpath);
        num_directories = 0;
        return dirs;
    }

    while ((dir = readdir(d)) != NULL)
    {
        if (strcmp(".", dir->d_name) == 0 ||
            strcmp("..", dir->d_name) == 0)
            continue;

        char full_dirpath[MAX_DIR_LENGTH];
        int len = snprintf(full_dirpath, sizeof(full_dirpath), "%s/%s",
                           dirpath, dir->d_name);
        if (len < 0 || (size_t) len >= sizeof(full_dirpath))
        {
            fprintf(stderr, "ERROR: path %s/%s is too long, skipping\n",
                    dirpath, dir->d_name);
            continue;
        }

        // Try using d_type to verify it's a directory. If we can't,
        // fall back to using lstat.
        if (dir->d_type != DT_DIR)
        {
            // TODO: Currently, when the file structure isn't what we
            // expect, we log an error and keep going. We do not exit(1)
            // from the script because then it will be reported as a
            // checksum failure, and we will not be able to differentiate
            // between failures due to the file structure not being what
            // this script expects, and actual checksum failures. In theory,
            // this should never happen, but just in case in the future
            // Postgres changes the way they structure their data files, we
            // should have a way to differentiate between the two cases, and
            // exit here.
            if (lstat(full_dirpath, &statbuf) < 0)
            {
                fprintf(stderr, "ERROR: lstat(%s) returned error: %s\n",
                        full_dirpath, strerror(errno));
                // statbuf holds nothing meaningful after a failed lstat.
                continue;
            }

            if (!S_ISDIR(statbuf.st_mode))
            {
                // TODO: Same as the above comment; we should
                // figure out a way to identify unexpected file structure.
                fprintf(stderr, "ERROR: unexpected file strucuture; "
                        "expected %s to be a directory, but "
                        "statbuf.st_mode was %d. Only files are expected "
                        "under data directories.\n",
                        full_dirpath, (int) statbuf.st_mode);
                continue;
            }
        }

        // The directory may have gained entries since it was counted.
        if (index == capacity)
        {
            int new_capacity = capacity * 2;
            char **grown = realloc(dirs, sizeof(*dirs) * (size_t) new_capacity);
            if (grown == NULL)
            {
                fprintf(stderr, "ERROR: out of memory while listing %s\n",
                        dirpath);
                exit(1);
            }
            dirs = grown;
            capacity = new_capacity;
        }

        dirs[index] = strdup(full_dirpath);
        if (dirs[index] == NULL)
        {
            fprintf(stderr, "ERROR: out of memory while listing %s\n",
                    dirpath);
            exit(1);
        }
        index++;
    }
    closedir(d);

    // Publish how many slots are really filled.
    num_directories = index;
    return dirs;
}

// Scans a directory within /pgsql/data/base (Ex. /pgsql/data/base/12345).
// The elements inside this directory should all be data files.
// This method should be called via a new thread so that we can scan directories
// in parallel.
//
// Returns the sum of scan_segmentfile() results for every regular file in
// dirpath. Entries named "." or "..", entries whose name contains
// "pg_internal.init", and entries that are not regular files are skipped;
// the latter are reported on stderr. A directory that cannot be opened is
// reported on stderr and yields 0.
int scan_data_files(const char *dirpath) {
    DIR *d;
    struct dirent *dirent;
    struct stat statbuf;
    int corrupt_pages_found = 0;

    d = opendir(dirpath);
    if (d == NULL)
    {
        fprintf(stderr, "ERROR: %s: %s cannot be opened\n", strerror(errno),
                dirpath);
        return 0;
    }

    while ((dirent = readdir(d)) != NULL)
    {
        if (strstr(dirent->d_name, "pg_internal.init") != NULL ||
            strcmp(".", dirent->d_name) == 0 ||
            strcmp("..", dirent->d_name) == 0)
            continue;

        // Try using d_type to verify it's a file. If we can't, fall back
        // to using lstat.
        if (dirent->d_type != DT_REG)
        {
            char full_dirpath[MAX_DIR_LENGTH];
            int len = snprintf(full_dirpath, sizeof(full_dirpath), "%s/%s",
                               dirpath, dirent->d_name);
            if (len < 0 || (size_t) len >= sizeof(full_dirpath))
            {
                fprintf(stderr, "ERROR: path %s/%s is too long, skipping\n",
                        dirpath, dirent->d_name);
                continue;
            }

            // TODO: Currently, when the file structure isn't what we
            // expect, we log an error and keep going. We do not exit(1)
            // from the script because then it will be reported as a
            // checksum failure, and we will not be able to differentiate
            // between failures due to the file structure not being what
            // this script expects, and actual checksum failures. In theory,
            // this should never happen, but just in case in the future
            // Postgres changes the way they structure their data files, we
            // should have a way to differentiate between the two cases, and
            // exit here.
            if (lstat(full_dirpath, &statbuf) < 0)
            {
                fprintf(stderr, "ERROR: lstat(%s) returned error: %s\n",
                        full_dirpath, strerror(errno));
                // statbuf holds nothing meaningful after a failed lstat.
                continue;
            }

            if (!S_ISREG(statbuf.st_mode))
            {
                // TODO: Same as the above comment; we should
                // figure out a way to identify unexpected file structure.
                fprintf(stderr, "ERROR: unexpected file strucuture; "
                        "expected %s to be a file, but statbuf.st_mode was "
                        "%d. Only files are expected under data "
                        "directories.\n",
                        full_dirpath, (int) statbuf.st_mode);
                continue;
            }
        }

        corrupt_pages_found += scan_segmentfile(dirent->d_name, dirpath);
    }

    closedir(d);
    return corrupt_pages_found;
}

// Hands out the next unprocessed position of the data directory queue.
// While holding mtx, the current queue_index is stored into *index and
// queue_index is advanced by one. The value stored into *index is also
// returned.
static int get_queue_index(int *index) {
  int claimed;

  pthread_mutex_lock(&mtx);
  claimed = queue_index++;
  *index = claimed;
  pthread_mutex_unlock(&mtx);

  return claimed;
}

// Function used by each thread. It will continously look for the next
// data directory to scan from the queue, scan it, then repeat until
// the queue has been processed.
static uint32 process_directories_worker(void *dirs) {
    const char **data_directories = (const char **)dirs;
    uint32 corrupt_pages_found = 0;

    int index = 0;
    while (get_queue_index(&index) < num_directories) {
      corrupt_pages_found += scan_data_files(data_directories[index]);
    }

    return corrupt_pages_found;
}

// Builds the queue of data directories, then spins up threads to
// process the queue.
static uint32 scan_base_directory(const char *dirpath) {
    uint32 corrupt_pages_found = 0;

    count_entries(dirpath);
    char **dirs = get_data_directories(dirpath);

    // Start up the workers.
    pthread_t threads[MAX_THREAD_COUNT];
    for (int i = 0; i < MAX_THREAD_COUNT; i++) {
        pthread_create(threads + i, NULL, (void *)&process_directories_worker,
                       (void *)dirs);
    }

    // Wait for all threads to finish.
    for (int i = 0; i < MAX_THREAD_COUNT; i++) {
        int corrupt_pages;
        pthread_join(threads[i], (void **) &corrupt_pages);
        corrupt_pages_found += corrupt_pages;
    }

    // Free everything.
    for (int i = 0; i < num_directories; i++) {
      free(dirs[i]);
    }
    free(dirs);

    return corrupt_pages_found;
}

/*
 * Scan the current working directory: every regular file is handed to
 * scan_segmentfile() and every sub-directory is entered and scanned the same
 * way.  dirpath is the name of the current directory as it should appear in
 * messages; it is never used to open anything, so entries are always
 * resolved relative to the directory being scanned.
 *
 * Returns the number of corrupt pages reported for this directory and
 * everything below it.  On return the working directory is the same as on
 * entry, unless stepping back out of a sub-directory failed (reported on
 * stderr).
 */
static uint32
scan_current_directory(const char *dirpath)
{
    DIR *d;
    struct dirent *dir;
    struct stat statbuf;
    uint32 corrupt_pages_found = 0;

    d = opendir(".");
    if (d == NULL)
    {
        fprintf(stderr, "ERROR: %s: %s cannot be opened\n", strerror(errno),
                dirpath);
        return 0;
    }

    while ((dir = readdir(d)) != NULL)
    {
        if (lstat(dir->d_name, &statbuf) < 0)
        {
            /* statbuf holds nothing meaningful after a failed lstat. */
            fprintf(stderr, "ERROR: lstat(%s/%s) returned error: %s\n",
                    dirpath, dir->d_name, strerror(errno));
            continue;
        }

        if (verbose)
            printf("DEBUG: direntry: %s/%s - statbuf.st_mode: %d\n",
                dirpath, dir->d_name, statbuf.st_mode);

        if (S_ISDIR(statbuf.st_mode))
        {
            if(strcmp(".", dir->d_name) == 0 ||
                strcmp("..", dir->d_name) == 0)
                continue;

            /* Display name only; a truncated name does not affect which
             * directory is scanned.
             */
            char new_dirpath[MAX_DIR_LENGTH];
            snprintf(new_dirpath, sizeof(new_dirpath), "%s/%s", dirpath,
                     dir->d_name);

            if (verbose)
                printf("DEBUG: called scan_directory(%s)\n", new_dirpath);

            if (chdir(dir->d_name) != 0)
            {
                fprintf(stderr, "ERROR: %s: cannot enter %s\n",
                        strerror(errno), new_dirpath);
                continue;
            }

            corrupt_pages_found += scan_current_directory(new_dirpath);

            /* Step back so that the remaining entries of this directory
             * resolve correctly.  lstat() reported a real directory, not a
             * symbolic link, so ".." leads back here.
             */
            if (chdir("..") != 0)
            {
                fprintf(stderr, "ERROR: %s: cannot return from %s\n",
                        strerror(errno), new_dirpath);
                break;
            }
        }
        else if (S_ISREG(statbuf.st_mode))
        {
            corrupt_pages_found += scan_segmentfile(dir->d_name, dirpath);
        }
    }
    closedir(d);

    return corrupt_pages_found;
}

static uint32
scan_directory(const char *dirpath)
{
    /* scope is only to check the base directory, where the actual data files
     * are located.  Other directories contain temporary files used for
     * transactions or queries in progress and should not be checked.
     *
     * Postgres stores data files in one directory per database defined,
     * without additional nesting or leafs.  This causes depth of database
     * directories to always be one.
     *
     * The working directory is changed to dirpath first because segment
     * files are opened by bare name in this (non-parallel) mode.  dirpath may
     * be absolute or relative to the working directory at the time of the
     * call.  Returns the number of corrupt pages found; a directory that
     * cannot be entered is reported on stderr and yields 0.
     */

    if (verbose)
        printf("DEBUG: called scan_directory(%s)\n", dirpath);

    if (chdir(dirpath) != 0)
    {
        fprintf(stderr, "ERROR: %s: cannot enter %s\n", strerror(errno),
                dirpath);
        return 0;
    }

    return scan_current_directory(dirpath);
}

/*
 * Print the usage summary to stdout.  argv_value is the program name shown
 * on the "Usage:" line.
 */
static void
print_help(const char *argv_value)
{
    /* One entry per output line; puts() supplies the trailing newline. */
    static const char *const option_lines[] = {
        "  -v                        verbose",
        "  -D directory              data directory",
        "  -h, --help                print this help and exit",
        "  -p                        scans data directories in parallel",
        "",
    };
    size_t line;

    printf("Usage: %s [OPTIONS]\n", argv_value);
    for (line = 0; line < sizeof(option_lines) / sizeof(option_lines[0]); line++)
        puts(option_lines[line]);
}

/*
 * Entry point.  Parses the command line, derives "<datadir>/base" from the
 * -D argument, scans it (with threads when -p is given) and reports the
 * result.
 *
 * Exit status is 0 when no corrupt pages were found and 1 when corruption
 * was found, on usage errors, and after printing the help text.
 */
int
main(int argc, char *argv[])
{
    /*
     * Flow is to use a data directory and traverse to find segments, each
     * segment file is then scanned for corrupt pages.
     */

    int c;
    uint32 corrupted_pages_found = 0;
    const char *short_opt = "chD:vp";
    char datadir[MAX_DIR_LENGTH] = "";   /* stays empty until -D is seen */
    struct stat statbuf;
    struct option long_opt[] =
    {
        {"dumpcorrupted", no_argument,       NULL, 'c'},
        {"datadir",       required_argument, NULL, 'D'},
        {"help",          no_argument,       NULL, 'h'},
        {"verbose",       no_argument,       NULL, 'v'},
        {"parallel",      no_argument,       NULL, 'p'},
        {NULL,            0,                 NULL, 0  }
    };

    /* if no arguments passed, print help and exit(1) */
    if (argc == 1)
    {
        print_help(argv[0]);
        exit(1);
    }

    while((c = getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1)
    {
        switch(c)
        {
            case 0:        /* long options toggles */
                break;

            case 'c':
                dump_corrupted = 1;
                break;

            case 'v':
                verbose = 1;
                break;

            case 'p':
                parallel = true;
                break;

            case 'D':
            {
                /* '-D' argument of data directory must contain 'base'
                 * directory to scan, check if trailing slash and
                 * append appropriately.  Must use base since other
                 * directories, such as pg_xlog, contain currently
                 * unsupported file types.
                 */
                size_t optlen;
                int written;

                if (optarg == NULL || optarg[0] == '\0')
                {
                    fprintf(stderr, "ERROR: -D argument could not be parsed\n");
                    exit(1);
                }

                optlen = strlen(optarg);
                written = snprintf(datadir, sizeof(datadir), "%s%sbase",
                                   optarg,
                                   optarg[optlen - 1] == '/' ? "" : "/");
                if (written < 0 || (size_t) written >= sizeof(datadir))
                {
                    fprintf(stderr, "ERROR: -D argument is too long\n");
                    exit(1);
                }
                break;
            }

            case 'h':
                print_help(argv[0]);
                exit(1);

            default:
                fprintf(stderr, "ERROR: %s: invalid option -- %c\n", argv[0], c);
                print_help(argv[0]);
                exit(1);
        }
    }

    /* Options were given, but none of them named the data directory. */
    if (datadir[0] == '\0')
    {
        fprintf(stderr, "ERROR: no data directory given, use -D\n");
        print_help(argv[0]);
        exit(1);
    }

    if (lstat(datadir, &statbuf) < 0)
    {
        fprintf(stderr, "ERROR: %s: base %s cannot be accessed\n",
                strerror(errno), datadir);
        exit(1);
    }
    if (!S_ISDIR(statbuf.st_mode))
    {
        fprintf(stderr, "ERROR: base %s is not a directory\n", datadir);
        exit(1);
    }

    if (parallel)
    {
        corrupted_pages_found = scan_base_directory(datadir);
    } else {
        corrupted_pages_found = scan_directory(datadir);
    }

    if (corrupted_pages_found > 0)
    {
        /* The count is unsigned, so print it as such. */
        printf("CORRUPTION FOUND: %u\n", corrupted_pages_found);
        exit(1);
    }
    else
    {
        printf("NO CORRUPTION FOUND\n");
        exit(0);
    }
}
