src/library/stats/R/fisher.test.R - R - Git at Google

 #  File src/library/stats/R/fisher.test.R
 #  Part of the R package, https://www.R-project.org
 #
 #  Copyright (C) 1995-2017 The R Core Team
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  A copy of the GNU General Public License is available at
 #  https://www.R-project.org/Licenses/

 fisher.test <-
 function(x, y = NULL, workspace = 200000, hybrid = FALSE,
          hybridPars = c(expect = 5, percent = 80, Emin = 1),
          control = list(), or = 1, alternative = "two.sided",
          conf.int = TRUE, conf.level = 0.95,
          simulate.p.value = FALSE, B = 2000)
 {
     DNAME <- deparse(substitute(x))
     METHOD <- "Fisher's Exact Test for Count Data"
     if(is.data.frame(x))
         x <- as.matrix(x)
     if(is.matrix(x)) {
         if(any(dim(x) < 2L))
             stop("'x' must have at least 2 rows and columns")
         if(!is.numeric(x) || any(x < 0) || anyNA(x))
             stop("all entries of 'x' must be nonnegative and finite")
         if(!is.integer(x)) {
             xo <- x
             x <- round(x)
             if(any(x > .Machine$integer.max))
                 stop("'x' has entries too large to be integer")
             if(!identical(TRUE, (ax <- all.equal(xo, x))))
                 warning(gettextf("'x' has been rounded to integer: %s", ax),
                         domain = NA)
             storage.mode(x) <- "integer"
         }
     }
     else {
         if(is.null(y))
             stop("if 'x' is not a matrix, 'y' must be given")
         if(length(x) != length(y))
             stop("'x' and 'y' must have the same length")
         DNAME <- paste(DNAME, "and", deparse(substitute(y)))
         OK <- complete.cases(x, y)
         ## use as.factor rather than factor here to be consistent with
         ## pre-tabulated data
         x <- as.factor(x[OK])
         y <- as.factor(y[OK])
         if((nlevels(x) < 2L) || (nlevels(y) < 2L))
             stop("'x' and 'y' must have at least 2 levels")
         x <- table(x, y)
     }
     ## x is integer
     con <- list(mult = 30)
     con[names(control)] <- control
     if((mult <- as.integer(con$mult)) < 2)
         stop("'mult' must be integer >= 2, typically = 30")

     nr <- nrow(x)
     nc <- ncol(x)
     have.2x2 <- (nr == 2) && (nc == 2)
     if(have.2x2) {
         alternative <- char.expand(alternative,
                                    c("two.sided", "less", "greater"))
         if(length(alternative) > 1L || is.na(alternative))
             stop("alternative must be \"two.sided\", \"less\" or \"greater\"")
         if(!((length(conf.level) == 1L) && is.finite(conf.level) &&
              (conf.level > 0) && (conf.level < 1)))
             stop("'conf.level' must be a single number between 0 and 1")
         if(!missing(or) && (length(or) > 1L || is.na(or) || or < 0))
             stop("'or' must be a single number between 0 and Inf")
     }

     PVAL <- NULL
     if(!have.2x2) {
         if(simulate.p.value) {
             ## we drop all-zero rows and columns
             sr <- rowSums(x)
             sc <- colSums(x)
             x <- x[sr > 0, sc > 0, drop = FALSE]
             nr <- as.integer(nrow(x))
             nc <- as.integer(ncol(x))
             if (is.na(nr) || is.na(nc) || is.na(nr * nc))
                 stop("invalid nrow(x) or ncol(x)", domain = NA)
             if(nr <= 1L)
                 stop("need 2 or more non-zero row marginals")
             if(nc <= 1L)
                 stop("need 2 or more non-zero column marginals")
             METHOD <- paste(METHOD, "with simulated p-value\n\t (based on", B,
 			     "replicates)")
             STATISTIC <- -sum(lfactorial(x))
             tmp <- .Call(C_Fisher_sim, rowSums(x), colSums(x), B)
 	    ## use correct significance level for a Monte Carlo test
             almost.1 <- 1 + 64 * .Machine$double.eps
             ## PR#10558: STATISTIC is negative
 	    PVAL <- (1 + sum(tmp <= STATISTIC/almost.1)) / (B + 1)
         } else if(hybrid) {
             ## Cochran condition for asym.chisq. decision is
             ## hybridPars = c(expect = 5, percent = 80, Emin = 1),
             if(!is.null(nhP <- names(hybridPars)) &&
                !identical(nhP, c("expect", "percent", "Emin")))
                 stop("names(hybridPars) should be NULL or be identical to the default's")
             stopifnot(is.double(hypp <- as.double(hybridPars)),
                       length(hypp) == 3L,
                       hypp[1] > 0, hypp[3] >= 0,
                       0 <= hypp[2], hypp[2] <= 100)
             PVAL <- .Call(C_Fexact, x, hypp, workspace, mult)
             METHOD <- paste(METHOD, sprintf("hybrid using asym.chisq. iff (exp=%g, perc=%g, Emin=%g)",
 					    hypp[1], hypp[2], hypp[3]))
          } else {
             ##  expect < 0 : exact
             PVAL <- .Call(C_Fexact, x, c(-1, 100, 0), workspace, mult)
         }

         RVAL <- list(p.value = max(0, min(1, PVAL)))
     } else { ## conf.int and more only in  2 x 2 case
         if(hybrid) warning("'hybrid' is ignored for a 2 x 2 table")
         m <- sum(x[, 1L])
         n <- sum(x[, 2L])
         k <- sum(x[1L, ])
         x <- x[1L, 1L]
         lo <- max(0L, k - n)
         hi <- min(k, m)
         NVAL <- c("odds ratio" = or)

         ## Note that in general the conditional distribution of x given
         ## the marginals is a non-central hypergeometric distribution H
         ## with non-centrality parameter ncp, the odds ratio.
         support <- lo : hi
         ## Density of the *central* hypergeometric distribution on its
         ## support: store for once as this is needed quite a bit.
         logdc <- dhyper(support, m, n, k, log = TRUE)
         dnhyper <- function(ncp) {
             ## Does not work for boundary values for ncp (0, Inf) but it
             ## does not need to.
             d <- logdc + log(ncp) * support
             d <- exp(d - max(d))        # beware of overflow
             d / sum(d)
         }
         mnhyper <- function(ncp) {
             if(ncp == 0)
                 return(lo)
             if(ncp == Inf)
                 return(hi)
             sum(support * dnhyper(ncp))
         }
         pnhyper <- function(q, ncp = 1, upper.tail = FALSE) {
 	    if(ncp == 1) {
 		return(if(upper.tail)
 		       phyper(x - 1, m, n, k, lower.tail = FALSE) else
 		       phyper(x,     m, n, k))
 	    }
 	    if(ncp == 0) {
 		return(as.numeric(if(upper.tail) q <= lo else q >= lo))
 	    }
 	    if(ncp == Inf) {
 		return(as.numeric(if(upper.tail) q <= hi else q >= hi))
 	    }
 	    ## else
 	    sum(dnhyper(ncp)[if(upper.tail) support >= q else support <= q])
         }

         ## Determine the p-value (if still necessary).
         if(is.null(PVAL)) {
             PVAL <-
                 switch(alternative,
                        less = pnhyper(x, or),
                        greater = pnhyper(x, or, upper.tail = TRUE),
                        two.sided = {
                            if(or == 0)
                                as.numeric(x == lo)
                            else if(or == Inf)
                                as.numeric(x == hi)
                            else {
                                ## Note that we need a little fuzz.
                                relErr <- 1 + 10 ^ (-7)
                                d <- dnhyper(or)
                                sum(d[d <= d[x - lo + 1] * relErr])
                            }
                        })
             RVAL <- list(p.value = PVAL)
         }

         ## Determine the MLE for ncp by solving E(X) = x, where the
         ## expectation is with respect to H.
         mle <- function(x) {
             if(x == lo)
                 return(0)
             if(x == hi)
                 return(Inf)
             mu <- mnhyper(1)
             if(mu > x)
                 uniroot(function(t) mnhyper(t) - x, c(0, 1))$root
             else if(mu < x)
                 1 / uniroot(function(t) mnhyper(1/t) - x,
                             c(.Machine$double.eps, 1))$root
             else
                 1
         }
         ESTIMATE <- c("odds ratio" = mle(x))

         if(conf.int) {
             ## Determine confidence intervals for the odds ratio.
             ncp.U <- function(x, alpha) {
                 if(x == hi)
                     return(Inf)
                 p <- pnhyper(x, 1)
                 if(p < alpha)
                     uniroot(function(t) pnhyper(x, t) - alpha,
                             c(0, 1))$root
                 else if(p > alpha)
                     1 / uniroot(function(t) pnhyper(x, 1/t) - alpha,
                                 c(.Machine$double.eps, 1))$root
                 else
                     1
             }
             ncp.L <- function(x, alpha) {
                 if(x == lo)
                     return(0)

                 p <- pnhyper(x, 1, upper.tail = TRUE)
                 if(p > alpha)
                     uniroot(function(t)
                             pnhyper(x, t, upper.tail = TRUE) - alpha,
                             c(0, 1))$root
                 else if(p < alpha)
                     1 / uniroot(function(t)
                                 pnhyper(x, 1/t, upper.tail = TRUE) - alpha,
                                 c(.Machine$double.eps, 1))$root
                 else
                     1
             }
             CINT <- switch(alternative,
                            less = c(0, ncp.U(x, 1 - conf.level)),
                            greater = c(ncp.L(x, 1 - conf.level), Inf),
                            two.sided = {
                                alpha <- (1 - conf.level) / 2
                                c(ncp.L(x, alpha), ncp.U(x, alpha))
                            })
             attr(CINT, "conf.level") <- conf.level
         }
         RVAL <- c(RVAL,
                   list(conf.int = if(conf.int) CINT,
                        estimate = ESTIMATE,
                        null.value = NVAL))
     } ## end (2 x 2)

     structure(c(RVAL,
                 alternative = alternative,
                 method = METHOD,
                 data.name = DNAME),
               class = "htest")
 }
	# File src/library/stats/R/fisher.test.R
	# Part of the R package, https://www.R-project.org
	#
	# Copyright (C) 1995-2017 The R Core Team
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# A copy of the GNU General Public License is available at
	# https://www.R-project.org/Licenses/

	fisher.test <-
	function(x, y = NULL, workspace = 200000, hybrid = FALSE,
	hybridPars = c(expect = 5, percent = 80, Emin = 1),
	control = list(), or = 1, alternative = "two.sided",
	conf.int = TRUE, conf.level = 0.95,
	simulate.p.value = FALSE, B = 2000)
	{
	DNAME <- deparse(substitute(x))
	METHOD <- "Fisher's Exact Test for Count Data"
	if(is.data.frame(x))
	x <- as.matrix(x)
	if(is.matrix(x)) {
	if(any(dim(x) < 2L))
	stop("'x' must have at least 2 rows and columns")
	if(!is.numeric(x) \|\| any(x < 0) \|\| anyNA(x))
	stop("all entries of 'x' must be nonnegative and finite")
	if(!is.integer(x)) {
	xo <- x
	x <- round(x)
	if(any(x > .Machine$integer.max))
	stop("'x' has entries too large to be integer")
	if(!identical(TRUE, (ax <- all.equal(xo, x))))
	warning(gettextf("'x' has been rounded to integer: %s", ax),
	domain = NA)
	storage.mode(x) <- "integer"
	}
	}
	else {
	if(is.null(y))
	stop("if 'x' is not a matrix, 'y' must be given")
	if(length(x) != length(y))
	stop("'x' and 'y' must have the same length")
	DNAME <- paste(DNAME, "and", deparse(substitute(y)))
	OK <- complete.cases(x, y)
	## use as.factor rather than factor here to be consistent with
	## pre-tabulated data
	x <- as.factor(x[OK])
	y <- as.factor(y[OK])
	if((nlevels(x) < 2L) \|\| (nlevels(y) < 2L))
	stop("'x' and 'y' must have at least 2 levels")
	x <- table(x, y)
	}
	## x is integer
	con <- list(mult = 30)
	con[names(control)] <- control
	if((mult <- as.integer(con$mult)) < 2)
	stop("'mult' must be integer >= 2, typically = 30")

	nr <- nrow(x)
	nc <- ncol(x)
	have.2x2 <- (nr == 2) && (nc == 2)
	if(have.2x2) {
	alternative <- char.expand(alternative,
	c("two.sided", "less", "greater"))
	if(length(alternative) > 1L \|\| is.na(alternative))
	stop("alternative must be \"two.sided\", \"less\" or \"greater\"")
	if(!((length(conf.level) == 1L) && is.finite(conf.level) &&
	(conf.level > 0) && (conf.level < 1)))
	stop("'conf.level' must be a single number between 0 and 1")
	if(!missing(or) && (length(or) > 1L \|\| is.na(or) \|\| or < 0))
	stop("'or' must be a single number between 0 and Inf")
	}

	PVAL <- NULL
	if(!have.2x2) {
	if(simulate.p.value) {
	## we drop all-zero rows and columns
	sr <- rowSums(x)
	sc <- colSums(x)
	x <- x[sr > 0, sc > 0, drop = FALSE]
	nr <- as.integer(nrow(x))
	nc <- as.integer(ncol(x))
	if (is.na(nr) \|\| is.na(nc) \|\| is.na(nr * nc))
	stop("invalid nrow(x) or ncol(x)", domain = NA)
	if(nr <= 1L)
	stop("need 2 or more non-zero row marginals")
	if(nc <= 1L)
	stop("need 2 or more non-zero column marginals")
	METHOD <- paste(METHOD, "with simulated p-value\n\t (based on", B,
	"replicates)")
	STATISTIC <- -sum(lfactorial(x))
	tmp <- .Call(C_Fisher_sim, rowSums(x), colSums(x), B)
	## use correct significance level for a Monte Carlo test
	almost.1 <- 1 + 64 * .Machine$double.eps
	## PR#10558: STATISTIC is negative
	PVAL <- (1 + sum(tmp <= STATISTIC/almost.1)) / (B + 1)
	} else if(hybrid) {
	## Cochran condition for asym.chisq. decision is
	## hybridPars = c(expect = 5, percent = 80, Emin = 1),
	if(!is.null(nhP <- names(hybridPars)) &&
	!identical(nhP, c("expect", "percent", "Emin")))
	stop("names(hybridPars) should be NULL or be identical to the default's")
	stopifnot(is.double(hypp <- as.double(hybridPars)),
	length(hypp) == 3L,
	hypp[1] > 0, hypp[3] >= 0,
	0 <= hypp[2], hypp[2] <= 100)
	PVAL <- .Call(C_Fexact, x, hypp, workspace, mult)
	METHOD <- paste(METHOD, sprintf("hybrid using asym.chisq. iff (exp=%g, perc=%g, Emin=%g)",
	hypp[1], hypp[2], hypp[3]))
	} else {
	## expect < 0 : exact
	PVAL <- .Call(C_Fexact, x, c(-1, 100, 0), workspace, mult)
	}

	RVAL <- list(p.value = max(0, min(1, PVAL)))
	} else { ## conf.int and more only in 2 x 2 case
	if(hybrid) warning("'hybrid' is ignored for a 2 x 2 table")
	m <- sum(x[, 1L])
	n <- sum(x[, 2L])
	k <- sum(x[1L, ])
	x <- x[1L, 1L]
	lo <- max(0L, k - n)
	hi <- min(k, m)
	NVAL <- c("odds ratio" = or)

	## Note that in general the conditional distribution of x given
	## the marginals is a non-central hypergeometric distribution H
	## with non-centrality parameter ncp, the odds ratio.
	support <- lo : hi
	## Density of the central hypergeometric distribution on its
	## support: store for once as this is needed quite a bit.
	logdc <- dhyper(support, m, n, k, log = TRUE)
	dnhyper <- function(ncp) {
	## Does not work for boundary values for ncp (0, Inf) but it
	## does not need to.
	d <- logdc + log(ncp) * support
	d <- exp(d - max(d)) # beware of overflow
	d / sum(d)
	}
	mnhyper <- function(ncp) {
	if(ncp == 0)
	return(lo)
	if(ncp == Inf)
	return(hi)
	sum(support * dnhyper(ncp))
	}
	pnhyper <- function(q, ncp = 1, upper.tail = FALSE) {
	if(ncp == 1) {
	return(if(upper.tail)
	phyper(x - 1, m, n, k, lower.tail = FALSE) else
	phyper(x, m, n, k))
	}
	if(ncp == 0) {
	return(as.numeric(if(upper.tail) q <= lo else q >= lo))
	}
	if(ncp == Inf) {
	return(as.numeric(if(upper.tail) q <= hi else q >= hi))
	}
	## else
	sum(dnhyper(ncp)[if(upper.tail) support >= q else support <= q])
	}

	## Determine the p-value (if still necessary).
	if(is.null(PVAL)) {
	PVAL <-
	switch(alternative,
	less = pnhyper(x, or),
	greater = pnhyper(x, or, upper.tail = TRUE),
	two.sided = {
	if(or == 0)
	as.numeric(x == lo)
	else if(or == Inf)
	as.numeric(x == hi)
	else {
	## Note that we need a little fuzz.
	relErr <- 1 + 10 ^ (-7)
	d <- dnhyper(or)
	sum(d[d <= d[x - lo + 1] * relErr])
	}
	})
	RVAL <- list(p.value = PVAL)
	}

	## Determine the MLE for ncp by solving E(X) = x, where the
	## expectation is with respect to H.
	mle <- function(x) {
	if(x == lo)
	return(0)
	if(x == hi)
	return(Inf)
	mu <- mnhyper(1)
	if(mu > x)
	uniroot(function(t) mnhyper(t) - x, c(0, 1))$root
	else if(mu < x)
	1 / uniroot(function(t) mnhyper(1/t) - x,
	c(.Machine$double.eps, 1))$root
	else
	1
	}
	ESTIMATE <- c("odds ratio" = mle(x))

	if(conf.int) {
	## Determine confidence intervals for the odds ratio.
	ncp.U <- function(x, alpha) {
	if(x == hi)
	return(Inf)
	p <- pnhyper(x, 1)
	if(p < alpha)
	uniroot(function(t) pnhyper(x, t) - alpha,
	c(0, 1))$root
	else if(p > alpha)
	1 / uniroot(function(t) pnhyper(x, 1/t) - alpha,
	c(.Machine$double.eps, 1))$root
	else
	1
	}
	ncp.L <- function(x, alpha) {
	if(x == lo)
	return(0)

	p <- pnhyper(x, 1, upper.tail = TRUE)
	if(p > alpha)
	uniroot(function(t)
	pnhyper(x, t, upper.tail = TRUE) - alpha,
	c(0, 1))$root
	else if(p < alpha)
	1 / uniroot(function(t)
	pnhyper(x, 1/t, upper.tail = TRUE) - alpha,
	c(.Machine$double.eps, 1))$root
	else
	1
	}
	CINT <- switch(alternative,
	less = c(0, ncp.U(x, 1 - conf.level)),
	greater = c(ncp.L(x, 1 - conf.level), Inf),
	two.sided = {
	alpha <- (1 - conf.level) / 2
	c(ncp.L(x, alpha), ncp.U(x, alpha))
	})
	attr(CINT, "conf.level") <- conf.level
	}
	RVAL <- c(RVAL,
	list(conf.int = if(conf.int) CINT,
	estimate = ESTIMATE,
	null.value = NVAL))
	} ## end (2 x 2)

	structure(c(RVAL,
	alternative = alternative,
	method = METHOD,
	data.name = DNAME),
	class = "htest")
	}