src/library/stats/R/mantelhaen.test.R - R - Git at Google

 #  File src/library/stats/R/mantelhaen.test.R
 #  Part of the R package, https://www.R-project.org
 #
 #  Copyright (C) 1995-2018 The R Core Team
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  A copy of the GNU General Public License is available at
 #  https://www.R-project.org/Licenses/

 mantelhaen.test <-
 function(x, y = NULL, z = NULL,
          alternative = c("two.sided", "less", "greater"),
          correct = TRUE, exact = FALSE, conf.level = 0.95)
 {
     DNAME <- deparse(substitute(x))
     if(is.array(x)) {
         if(length(dim(x)) == 3L) {
             if(anyNA(x)) stop("NAs are not allowed")
             if(any(dim(x) < 2L)) stop("each dimension in table must be >= 2")
         }
         else
             stop("'x' must be a 3-dimensional array")
     }
     else {
         if(is.null(y)) stop("if 'x' is not an array, 'y' must be given")
         if(is.null(z)) stop("if 'x' is not an array, 'z' must be given")
         if(any(diff(c(length(x), length(y), length(z))) != 0L ))
             stop("'x', 'y', and 'z' must have the same length")
         DNAME <- paste(DNAME, "and", deparse(substitute(y)), "and",
                        deparse(substitute(z)))
         OK <- complete.cases(x, y, z)
         x <- factor(x[OK])
         y <- factor(y[OK])
         if((nlevels(x) < 2L) || (nlevels(y) < 2L))
             stop("'x' and 'y' must have at least 2 levels")
         else
             x <- table(x, y, z[OK])
     }

     if(any(apply(x, 3L, sum) < 2))
         stop("sample size in each stratum must be > 1")

     I <- dim(x)[1L]
     J <- dim(x)[2L]
     K <- dim(x)[3L]

     if((I == 2) && (J == 2)) {
         ## 2 x 2 x K case
         alternative <- match.arg(alternative)
         if(!missing(conf.level) &&
            (length(conf.level) != 1 || !is.finite(conf.level) ||
             conf.level < 0 || conf.level > 1))
             stop("'conf.level' must be a single number between 0 and 1")

         NVAL <- c("common odds ratio" = 1)

         if(!exact) {
             ## Classical Mantel-Haenszel 2 x 2 x K test
             s.x <- apply(x, c(1L, 3L), sum)
             s.y <- apply(x, c(2L, 3L), sum)
             n <- as.double(apply(x, 3L, sum)) # avoid overflows below
             DELTA <- sum(x[1, 1, ] - s.x[1, ] * s.y[1, ] / n)
 	    YATES <- if(correct && (abs(DELTA) >= .5)) .5 else 0
             STATISTIC <- ((abs(DELTA) - YATES)^2 /
                           sum(apply(rbind(s.x, s.y), 2L, prod)
                               / (n^2 * (n - 1))))
             PARAMETER <- 1
             if (alternative == "two.sided")
                 PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
             else {
                 z <- sign(DELTA) * sqrt(STATISTIC)
                 PVAL <- pnorm(z, lower.tail = (alternative == "less"))
             }

             names(STATISTIC) <- "Mantel-Haenszel X-squared"
             names(PARAMETER) <- "df"
             METHOD <- paste("Mantel-Haenszel chi-squared test",
                             if(YATES) "with" else "without",
                             "continuity correction")
             s.diag <- sum(x[1L, 1L, ] * x[2L, 2L, ] / n)
             s.offd <- sum(x[1L, 2L, ] * x[2L, 1L, ] / n)
             ## Mantel-Haenszel (1959) estimate of the common odds ratio.
             ESTIMATE <- s.diag / s.offd
             ## Robins et al. (1986) estimate of the standard deviation
             ## of the log of the Mantel-Haenszel estimator.
             sd <-
                 sqrt(  sum((x[1L,1L,] + x[2L,2L,]) * x[1L,1L,] * x[2L,2L,]
                            / n^2)
                      / (2 * s.diag^2)
                      + sum((  (x[1L,1L,] + x[2L,2L,]) * x[1L,2L,] * x[2L,1L,]
                             + (x[1L,2L,] + x[2L,1L,]) * x[1L,1L,] * x[2L,2L,])
                            / n^2)
                      / (2 * s.diag * s.offd)
                      + sum((x[1L,2L,] + x[2L,1L,]) * x[1L,2L,] * x[2L,1L,]
                            / n^2)
                      / (2 * s.offd^2))
             CINT <-
                 switch(alternative,
                        less = c(0, ESTIMATE * exp(qnorm(conf.level) * sd)),
                        greater = c(ESTIMATE * exp(qnorm(conf.level,
                                    lower.tail = FALSE) * sd), Inf),
                        two.sided = {
                            ESTIMATE * exp(c(1, -1) *
                                           qnorm((1 - conf.level) / 2) * sd)
                        })
             RVAL <- list(statistic = STATISTIC,
                          parameter = PARAMETER,
                          p.value = PVAL)
         }
         else {
             ## Exact inference for the 2 x 2 x k case can be carried out
             ## conditional on the strata margins, similar to the case
             ## for Fisher's exact test (k = 1).  Again, the distribution
             ## of S (in our case, sum(x[2, 1, ]) to be consistent with
             ## the notation in Mehta et al. (1985), is of the form
             ##    P(S = s) \propto d(s) * or^s,   lo <= s <= hi
             ## where or is the common odds ratio in the k tables (and
             ## d(.) is a product hypergeometric distribution).

             METHOD <- paste("Exact conditional test of independence",
                             "in 2 x 2 x k tables")
             mn <- apply(x, c(2L, 3L), sum)
             m <- mn[1L, ]
             n <- mn[2L, ]
             t <- apply(x, c(1L, 3L), sum)[1L, ]
             s <- sum(x[1L, 1L, ])
             lo <- sum(pmax(0, t - n))
             hi <- sum(pmin(m, t))

             support <- lo : hi
             ## Density of the *central* product hypergeometric
             ## distribution on its support: store for once as this is
             ## needed quite a bit.

             dc <- .Call(C_d2x2xk, K, m, n, t, hi - lo + 1L)
             logdc <- log(dc)

             dn2x2xk <- function(ncp) {
                 ## Does not work for boundary values for ncp (0, Inf)
                 ## but it does not need to.
                 if(ncp == 1) return(dc)
                 d <- logdc + log(ncp) * support
                 d <- exp(d - max(d))    # beware of overflow
                 d / sum(d)
             }
             mn2x2xk <- function(ncp) {
                 if(ncp == 0)
                     return(lo)
                 if(ncp == Inf)
                     return(hi)
                 sum(support * dn2x2xk(ncp))
             }
             pn2x2xk <- function(q, ncp = 1, upper.tail = FALSE) {
                 if(ncp == 0) {
                     as.numeric(if(upper.tail) q <= lo else q >= lo)
                 }
                 else if(ncp == Inf)
                     as.numeric(if(upper.tail) q <= hi else q >= hi)
                 else {
                     d <- dn2x2xk(ncp)
                     sum(d[if(upper.tail) support >= q else support <= q])
                 }
             }

             ## Determine the p-value.
             PVAL <-
                 switch(alternative,
                        less = pn2x2xk(s, 1),
                        greater = pn2x2xk(s, 1, upper.tail = TRUE),
                        two.sided = {
                            ## Note that we need a little fuzz.
                            relErr <- 1 + 10 ^ (-7)
                            d <- dc      # same as dn2x2xk(1)
                            sum(d[d <= d[s - lo + 1] * relErr])
                        })

             ## Determine the MLE for ncp by solving E(S) = s, where the
             ## expectation is with respect to the above distribution.
             mle <- function(x) {
                 if(x == lo)
                     return(0)
                 if(x == hi)
                     return(Inf)
                 mu <- mn2x2xk(1)
                 if(mu > x)
                     uniroot(function(t) mn2x2xk(t) - x,
                             c(0, 1))$root
                 else if(mu < x)
                     1 / uniroot(function(t) mn2x2xk(1/t) - x,
                                 c(.Machine$double.eps, 1))$root
                 else
                     1
             }
             ESTIMATE <- mle(s)

             ## Determine confidence intervals for the odds ratio.
             ncp.U <- function(x, alpha) {
                 if(x == hi)
                     return(Inf)
                 p <- pn2x2xk(x, 1)
                 if(p < alpha)
                     uniroot(function(t) pn2x2xk(x, t) - alpha,
                             c(0, 1))$root
                 else if(p > alpha)
                     1 / uniroot(function(t) pn2x2xk(x, 1/t) - alpha,
                                 c(.Machine$double.eps, 1))$root
                 else
                     1
             }
             ncp.L <- function(x, alpha) {
                 if(x == lo)
                     return(0)
                 p <- pn2x2xk(x, 1, upper.tail = TRUE)
                 if(p > alpha)
                     uniroot(function(t)
                             pn2x2xk(x, t, upper.tail = TRUE) - alpha,
                             c(0, 1))$root
             else if (p < alpha)
                 1 / uniroot(function(t)
                             pn2x2xk(x, 1/t, upper.tail = TRUE) - alpha,
                             c(.Machine$double.eps, 1))$root
             else
                 1
             }
             CINT <- switch(alternative,
                            less = c(0, ncp.U(s, 1 - conf.level)),
                            greater = c(ncp.L(s, 1 - conf.level), Inf),
                            two.sided = {
                                alpha <- (1 - conf.level) / 2
                                c(ncp.L(s, alpha), ncp.U(s, alpha))
                            })

             STATISTIC <- c(S = s)
             RVAL <- list(statistic = STATISTIC,
                          p.value = PVAL)
         }

         names(ESTIMATE) <- names(NVAL)
         attr(CINT, "conf.level") <- conf.level
         RVAL <- c(RVAL,
                   list(conf.int = CINT,
                        estimate = ESTIMATE,
                        null.value = NVAL,
                        alternative = alternative))
     }
     else {
         ## Generalized Cochran-Mantel-Haenszel I x J x K test
         ## Agresti (1990), pages 234--235.
         ## Agresti (2002), pages 295ff.
         ## Note that n in the reference is in column-major order.
         ## (Thanks to Torsten Hothorn for spotting this.)
         df <- (I - 1) * (J - 1)
         n <- m <- double(length = df)
         V <- matrix(0, nrow = df, ncol = df)
         for (k in 1 : K) {
             f <- x[ , , k]              # frequencies in stratum k
             ntot <- as.double(sum(f))   # n_{..k}   (and avoid overflow)
             rowsums <- rowSums(f)[-I]   # n_{i.k}, i = 1 to I-1
             colsums <- colSums(f)[-J]   # n_{.jk}, j = 1 to J-1
             n <- n + c(f[-I, -J])
             m <- m + c(outer(rowsums, colsums, "*")) / ntot
             V <- V + (kronecker(diag(ntot * colsums, nrow = J - 1L)
                                 - outer(colsums, colsums),
                                 diag(ntot * rowsums, nrow = I - 1L)
                                 - outer(rowsums, rowsums))
                       / (ntot^2 * (ntot - 1)))
         }
         n <- n - m
         STATISTIC <- c(crossprod(n, qr.solve(V, n)))
         PARAMETER <- df
         PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
         names(STATISTIC) <- "Cochran-Mantel-Haenszel M^2"
         names(PARAMETER) <- "df"
         METHOD <- "Cochran-Mantel-Haenszel test"
         RVAL <- list(statistic = STATISTIC,
                      parameter = PARAMETER,
                      p.value = PVAL)
     }
     structure(c(RVAL,
                 list(method = METHOD,
                      data.name = DNAME)),
               class = "htest")
 }
	# File src/library/stats/R/mantelhaen.test.R
	# Part of the R package, https://www.R-project.org
	#
	# Copyright (C) 1995-2018 The R Core Team
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# A copy of the GNU General Public License is available at
	# https://www.R-project.org/Licenses/

	mantelhaen.test <-
	function(x, y = NULL, z = NULL,
	alternative = c("two.sided", "less", "greater"),
	correct = TRUE, exact = FALSE, conf.level = 0.95)
	{
	DNAME <- deparse(substitute(x))
	if(is.array(x)) {
	if(length(dim(x)) == 3L) {
	if(anyNA(x)) stop("NAs are not allowed")
	if(any(dim(x) < 2L)) stop("each dimension in table must be >= 2")
	}
	else
	stop("'x' must be a 3-dimensional array")
	}
	else {
	if(is.null(y)) stop("if 'x' is not an array, 'y' must be given")
	if(is.null(z)) stop("if 'x' is not an array, 'z' must be given")
	if(any(diff(c(length(x), length(y), length(z))) != 0L ))
	stop("'x', 'y', and 'z' must have the same length")
	DNAME <- paste(DNAME, "and", deparse(substitute(y)), "and",
	deparse(substitute(z)))
	OK <- complete.cases(x, y, z)
	x <- factor(x[OK])
	y <- factor(y[OK])
	if((nlevels(x) < 2L) \|\| (nlevels(y) < 2L))
	stop("'x' and 'y' must have at least 2 levels")
	else
	x <- table(x, y, z[OK])
	}

	if(any(apply(x, 3L, sum) < 2))
	stop("sample size in each stratum must be > 1")

	I <- dim(x)[1L]
	J <- dim(x)[2L]
	K <- dim(x)[3L]

	if((I == 2) && (J == 2)) {
	## 2 x 2 x K case
	alternative <- match.arg(alternative)
	if(!missing(conf.level) &&
	(length(conf.level) != 1 \|\| !is.finite(conf.level) \|\|
	conf.level < 0 \|\| conf.level > 1))
	stop("'conf.level' must be a single number between 0 and 1")

	NVAL <- c("common odds ratio" = 1)

	if(!exact) {
	## Classical Mantel-Haenszel 2 x 2 x K test
	s.x <- apply(x, c(1L, 3L), sum)
	s.y <- apply(x, c(2L, 3L), sum)
	n <- as.double(apply(x, 3L, sum)) # avoid overflows below
	DELTA <- sum(x[1, 1, ] - s.x[1, ] * s.y[1, ] / n)
	YATES <- if(correct && (abs(DELTA) >= .5)) .5 else 0
	STATISTIC <- ((abs(DELTA) - YATES)^2 /
	sum(apply(rbind(s.x, s.y), 2L, prod)
	/ (n^2 * (n - 1))))
	PARAMETER <- 1
	if (alternative == "two.sided")
	PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
	else {
	z <- sign(DELTA) * sqrt(STATISTIC)
	PVAL <- pnorm(z, lower.tail = (alternative == "less"))
	}

	names(STATISTIC) <- "Mantel-Haenszel X-squared"
	names(PARAMETER) <- "df"
	METHOD <- paste("Mantel-Haenszel chi-squared test",
	if(YATES) "with" else "without",
	"continuity correction")
	s.diag <- sum(x[1L, 1L, ] * x[2L, 2L, ] / n)
	s.offd <- sum(x[1L, 2L, ] * x[2L, 1L, ] / n)
	## Mantel-Haenszel (1959) estimate of the common odds ratio.
	ESTIMATE <- s.diag / s.offd
	## Robins et al. (1986) estimate of the standard deviation
	## of the log of the Mantel-Haenszel estimator.
	sd <-
	sqrt( sum((x[1L,1L,] + x[2L,2L,]) * x[1L,1L,] * x[2L,2L,]
	/ n^2)
	/ (2 * s.diag^2)
	+ sum(( (x[1L,1L,] + x[2L,2L,]) * x[1L,2L,] * x[2L,1L,]
	+ (x[1L,2L,] + x[2L,1L,]) * x[1L,1L,] * x[2L,2L,])
	/ n^2)
	/ (2 * s.diag * s.offd)
	+ sum((x[1L,2L,] + x[2L,1L,]) * x[1L,2L,] * x[2L,1L,]
	/ n^2)
	/ (2 * s.offd^2))
	CINT <-
	switch(alternative,
	less = c(0, ESTIMATE * exp(qnorm(conf.level) * sd)),
	greater = c(ESTIMATE * exp(qnorm(conf.level,
	lower.tail = FALSE) * sd), Inf),
	two.sided = {
	ESTIMATE * exp(c(1, -1) *
	qnorm((1 - conf.level) / 2) * sd)
	})
	RVAL <- list(statistic = STATISTIC,
	parameter = PARAMETER,
	p.value = PVAL)
	}
	else {
	## Exact inference for the 2 x 2 x k case can be carried out
	## conditional on the strata margins, similar to the case
	## for Fisher's exact test (k = 1). Again, the distribution
	## of S (in our case, sum(x[2, 1, ]) to be consistent with
	## the notation in Mehta et al. (1985), is of the form
	## P(S = s) \propto d(s) * or^s, lo <= s <= hi
	## where or is the common odds ratio in the k tables (and
	## d(.) is a product hypergeometric distribution).

	METHOD <- paste("Exact conditional test of independence",
	"in 2 x 2 x k tables")
	mn <- apply(x, c(2L, 3L), sum)
	m <- mn[1L, ]
	n <- mn[2L, ]
	t <- apply(x, c(1L, 3L), sum)[1L, ]
	s <- sum(x[1L, 1L, ])
	lo <- sum(pmax(0, t - n))
	hi <- sum(pmin(m, t))

	support <- lo : hi
	## Density of the central product hypergeometric
	## distribution on its support: store for once as this is
	## needed quite a bit.

	dc <- .Call(C_d2x2xk, K, m, n, t, hi - lo + 1L)
	logdc <- log(dc)

	dn2x2xk <- function(ncp) {
	## Does not work for boundary values for ncp (0, Inf)
	## but it does not need to.
	if(ncp == 1) return(dc)
	d <- logdc + log(ncp) * support
	d <- exp(d - max(d)) # beware of overflow
	d / sum(d)
	}
	mn2x2xk <- function(ncp) {
	if(ncp == 0)
	return(lo)
	if(ncp == Inf)
	return(hi)
	sum(support * dn2x2xk(ncp))
	}
	pn2x2xk <- function(q, ncp = 1, upper.tail = FALSE) {
	if(ncp == 0) {
	as.numeric(if(upper.tail) q <= lo else q >= lo)
	}
	else if(ncp == Inf)
	as.numeric(if(upper.tail) q <= hi else q >= hi)
	else {
	d <- dn2x2xk(ncp)
	sum(d[if(upper.tail) support >= q else support <= q])
	}
	}

	## Determine the p-value.
	PVAL <-
	switch(alternative,
	less = pn2x2xk(s, 1),
	greater = pn2x2xk(s, 1, upper.tail = TRUE),
	two.sided = {
	## Note that we need a little fuzz.
	relErr <- 1 + 10 ^ (-7)
	d <- dc # same as dn2x2xk(1)
	sum(d[d <= d[s - lo + 1] * relErr])
	})

	## Determine the MLE for ncp by solving E(S) = s, where the
	## expectation is with respect to the above distribution.
	mle <- function(x) {
	if(x == lo)
	return(0)
	if(x == hi)
	return(Inf)
	mu <- mn2x2xk(1)
	if(mu > x)
	uniroot(function(t) mn2x2xk(t) - x,
	c(0, 1))$root
	else if(mu < x)
	1 / uniroot(function(t) mn2x2xk(1/t) - x,
	c(.Machine$double.eps, 1))$root
	else
	1
	}
	ESTIMATE <- mle(s)

	## Determine confidence intervals for the odds ratio.
	ncp.U <- function(x, alpha) {
	if(x == hi)
	return(Inf)
	p <- pn2x2xk(x, 1)
	if(p < alpha)
	uniroot(function(t) pn2x2xk(x, t) - alpha,
	c(0, 1))$root
	else if(p > alpha)
	1 / uniroot(function(t) pn2x2xk(x, 1/t) - alpha,
	c(.Machine$double.eps, 1))$root
	else
	1
	}
	ncp.L <- function(x, alpha) {
	if(x == lo)
	return(0)
	p <- pn2x2xk(x, 1, upper.tail = TRUE)
	if(p > alpha)
	uniroot(function(t)
	pn2x2xk(x, t, upper.tail = TRUE) - alpha,
	c(0, 1))$root
	else if (p < alpha)
	1 / uniroot(function(t)
	pn2x2xk(x, 1/t, upper.tail = TRUE) - alpha,
	c(.Machine$double.eps, 1))$root
	else
	1
	}
	CINT <- switch(alternative,
	less = c(0, ncp.U(s, 1 - conf.level)),
	greater = c(ncp.L(s, 1 - conf.level), Inf),
	two.sided = {
	alpha <- (1 - conf.level) / 2
	c(ncp.L(s, alpha), ncp.U(s, alpha))
	})

	STATISTIC <- c(S = s)
	RVAL <- list(statistic = STATISTIC,
	p.value = PVAL)
	}

	names(ESTIMATE) <- names(NVAL)
	attr(CINT, "conf.level") <- conf.level
	RVAL <- c(RVAL,
	list(conf.int = CINT,
	estimate = ESTIMATE,
	null.value = NVAL,
	alternative = alternative))
	}
	else {
	## Generalized Cochran-Mantel-Haenszel I x J x K test
	## Agresti (1990), pages 234--235.
	## Agresti (2002), pages 295ff.
	## Note that n in the reference is in column-major order.
	## (Thanks to Torsten Hothorn for spotting this.)
	df <- (I - 1) * (J - 1)
	n <- m <- double(length = df)
	V <- matrix(0, nrow = df, ncol = df)
	for (k in 1 : K) {
	f <- x[ , , k] # frequencies in stratum k
	ntot <- as.double(sum(f)) # n_{..k} (and avoid overflow)
	rowsums <- rowSums(f)[-I] # n_{i.k}, i = 1 to I-1
	colsums <- colSums(f)[-J] # n_{.jk}, j = 1 to J-1
	n <- n + c(f[-I, -J])
	m <- m + c(outer(rowsums, colsums, "*")) / ntot
	V <- V + (kronecker(diag(ntot * colsums, nrow = J - 1L)
	- outer(colsums, colsums),
	diag(ntot * rowsums, nrow = I - 1L)
	- outer(rowsums, rowsums))
	/ (ntot^2 * (ntot - 1)))
	}
	n <- n - m
	STATISTIC <- c(crossprod(n, qr.solve(V, n)))
	PARAMETER <- df
	PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
	names(STATISTIC) <- "Cochran-Mantel-Haenszel M^2"
	names(PARAMETER) <- "df"
	METHOD <- "Cochran-Mantel-Haenszel test"
	RVAL <- list(statistic = STATISTIC,
	parameter = PARAMETER,
	p.value = PVAL)
	}
	structure(c(RVAL,
	list(method = METHOD,
	data.name = DNAME)),
	class = "htest")
	}