| # File src/library/stats/R/mantelhaen.test.R |
| # Part of the R package, https://www.R-project.org |
| # |
| # Copyright (C) 1995-2018 The R Core Team |
| # |
| # This program is free software; you can redistribute it and/or modify |
| # it under the terms of the GNU General Public License as published by |
| # the Free Software Foundation; either version 2 of the License, or |
| # (at your option) any later version. |
| # |
| # This program is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # A copy of the GNU General Public License is available at |
| # https://www.R-project.org/Licenses/ |
| |
| mantelhaen.test <- |
| function(x, y = NULL, z = NULL, |
| alternative = c("two.sided", "less", "greater"), |
| correct = TRUE, exact = FALSE, conf.level = 0.95) |
| { |
| DNAME <- deparse(substitute(x)) |
| if(is.array(x)) { |
| if(length(dim(x)) == 3L) { |
| if(anyNA(x)) stop("NAs are not allowed") |
| if(any(dim(x) < 2L)) stop("each dimension in table must be >= 2") |
| } |
| else |
| stop("'x' must be a 3-dimensional array") |
| } |
| else { |
| if(is.null(y)) stop("if 'x' is not an array, 'y' must be given") |
| if(is.null(z)) stop("if 'x' is not an array, 'z' must be given") |
| if(any(diff(c(length(x), length(y), length(z))) != 0L )) |
| stop("'x', 'y', and 'z' must have the same length") |
| DNAME <- paste(DNAME, "and", deparse(substitute(y)), "and", |
| deparse(substitute(z))) |
| OK <- complete.cases(x, y, z) |
| x <- factor(x[OK]) |
| y <- factor(y[OK]) |
| if((nlevels(x) < 2L) || (nlevels(y) < 2L)) |
| stop("'x' and 'y' must have at least 2 levels") |
| else |
| x <- table(x, y, z[OK]) |
| } |
| |
| if(any(apply(x, 3L, sum) < 2)) |
| stop("sample size in each stratum must be > 1") |
| |
| I <- dim(x)[1L] |
| J <- dim(x)[2L] |
| K <- dim(x)[3L] |
| |
| if((I == 2) && (J == 2)) { |
| ## 2 x 2 x K case |
| alternative <- match.arg(alternative) |
| if(!missing(conf.level) && |
| (length(conf.level) != 1 || !is.finite(conf.level) || |
| conf.level < 0 || conf.level > 1)) |
| stop("'conf.level' must be a single number between 0 and 1") |
| |
| NVAL <- c("common odds ratio" = 1) |
| |
| if(!exact) { |
| ## Classical Mantel-Haenszel 2 x 2 x K test |
| s.x <- apply(x, c(1L, 3L), sum) |
| s.y <- apply(x, c(2L, 3L), sum) |
| n <- as.double(apply(x, 3L, sum)) # avoid overflows below |
| DELTA <- sum(x[1, 1, ] - s.x[1, ] * s.y[1, ] / n) |
| YATES <- if(correct && (abs(DELTA) >= .5)) .5 else 0 |
| STATISTIC <- ((abs(DELTA) - YATES)^2 / |
| sum(apply(rbind(s.x, s.y), 2L, prod) |
| / (n^2 * (n - 1)))) |
| PARAMETER <- 1 |
| if (alternative == "two.sided") |
| PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE) |
| else { |
| z <- sign(DELTA) * sqrt(STATISTIC) |
| PVAL <- pnorm(z, lower.tail = (alternative == "less")) |
| } |
| |
| names(STATISTIC) <- "Mantel-Haenszel X-squared" |
| names(PARAMETER) <- "df" |
| METHOD <- paste("Mantel-Haenszel chi-squared test", |
| if(YATES) "with" else "without", |
| "continuity correction") |
| s.diag <- sum(x[1L, 1L, ] * x[2L, 2L, ] / n) |
| s.offd <- sum(x[1L, 2L, ] * x[2L, 1L, ] / n) |
| ## Mantel-Haenszel (1959) estimate of the common odds ratio. |
| ESTIMATE <- s.diag / s.offd |
| ## Robins et al. (1986) estimate of the standard deviation |
| ## of the log of the Mantel-Haenszel estimator. |
| sd <- |
| sqrt( sum((x[1L,1L,] + x[2L,2L,]) * x[1L,1L,] * x[2L,2L,] |
| / n^2) |
| / (2 * s.diag^2) |
| + sum(( (x[1L,1L,] + x[2L,2L,]) * x[1L,2L,] * x[2L,1L,] |
| + (x[1L,2L,] + x[2L,1L,]) * x[1L,1L,] * x[2L,2L,]) |
| / n^2) |
| / (2 * s.diag * s.offd) |
| + sum((x[1L,2L,] + x[2L,1L,]) * x[1L,2L,] * x[2L,1L,] |
| / n^2) |
| / (2 * s.offd^2)) |
| CINT <- |
| switch(alternative, |
| less = c(0, ESTIMATE * exp(qnorm(conf.level) * sd)), |
| greater = c(ESTIMATE * exp(qnorm(conf.level, |
| lower.tail = FALSE) * sd), Inf), |
| two.sided = { |
| ESTIMATE * exp(c(1, -1) * |
| qnorm((1 - conf.level) / 2) * sd) |
| }) |
| RVAL <- list(statistic = STATISTIC, |
| parameter = PARAMETER, |
| p.value = PVAL) |
| } |
| else { |
| ## Exact inference for the 2 x 2 x k case can be carried out |
| ## conditional on the strata margins, similar to the case |
| ## for Fisher's exact test (k = 1). Again, the distribution |
| ## of S (in our case, sum(x[2, 1, ]) to be consistent with |
| ## the notation in Mehta et al. (1985), is of the form |
| ## P(S = s) \propto d(s) * or^s, lo <= s <= hi |
| ## where or is the common odds ratio in the k tables (and |
| ## d(.) is a product hypergeometric distribution). |
| |
| METHOD <- paste("Exact conditional test of independence", |
| "in 2 x 2 x k tables") |
| mn <- apply(x, c(2L, 3L), sum) |
| m <- mn[1L, ] |
| n <- mn[2L, ] |
| t <- apply(x, c(1L, 3L), sum)[1L, ] |
| s <- sum(x[1L, 1L, ]) |
| lo <- sum(pmax(0, t - n)) |
| hi <- sum(pmin(m, t)) |
| |
| support <- lo : hi |
| ## Density of the *central* product hypergeometric |
| ## distribution on its support: store for once as this is |
| ## needed quite a bit. |
| |
| dc <- .Call(C_d2x2xk, K, m, n, t, hi - lo + 1L) |
| logdc <- log(dc) |
| |
| dn2x2xk <- function(ncp) { |
| ## Does not work for boundary values for ncp (0, Inf) |
| ## but it does not need to. |
| if(ncp == 1) return(dc) |
| d <- logdc + log(ncp) * support |
| d <- exp(d - max(d)) # beware of overflow |
| d / sum(d) |
| } |
| mn2x2xk <- function(ncp) { |
| if(ncp == 0) |
| return(lo) |
| if(ncp == Inf) |
| return(hi) |
| sum(support * dn2x2xk(ncp)) |
| } |
| pn2x2xk <- function(q, ncp = 1, upper.tail = FALSE) { |
| if(ncp == 0) { |
| as.numeric(if(upper.tail) q <= lo else q >= lo) |
| } |
| else if(ncp == Inf) |
| as.numeric(if(upper.tail) q <= hi else q >= hi) |
| else { |
| d <- dn2x2xk(ncp) |
| sum(d[if(upper.tail) support >= q else support <= q]) |
| } |
| } |
| |
| ## Determine the p-value. |
| PVAL <- |
| switch(alternative, |
| less = pn2x2xk(s, 1), |
| greater = pn2x2xk(s, 1, upper.tail = TRUE), |
| two.sided = { |
| ## Note that we need a little fuzz. |
| relErr <- 1 + 10 ^ (-7) |
| d <- dc # same as dn2x2xk(1) |
| sum(d[d <= d[s - lo + 1] * relErr]) |
| }) |
| |
| ## Determine the MLE for ncp by solving E(S) = s, where the |
| ## expectation is with respect to the above distribution. |
| mle <- function(x) { |
| if(x == lo) |
| return(0) |
| if(x == hi) |
| return(Inf) |
| mu <- mn2x2xk(1) |
| if(mu > x) |
| uniroot(function(t) mn2x2xk(t) - x, |
| c(0, 1))$root |
| else if(mu < x) |
| 1 / uniroot(function(t) mn2x2xk(1/t) - x, |
| c(.Machine$double.eps, 1))$root |
| else |
| 1 |
| } |
| ESTIMATE <- mle(s) |
| |
| ## Determine confidence intervals for the odds ratio. |
| ncp.U <- function(x, alpha) { |
| if(x == hi) |
| return(Inf) |
| p <- pn2x2xk(x, 1) |
| if(p < alpha) |
| uniroot(function(t) pn2x2xk(x, t) - alpha, |
| c(0, 1))$root |
| else if(p > alpha) |
| 1 / uniroot(function(t) pn2x2xk(x, 1/t) - alpha, |
| c(.Machine$double.eps, 1))$root |
| else |
| 1 |
| } |
| ncp.L <- function(x, alpha) { |
| if(x == lo) |
| return(0) |
| p <- pn2x2xk(x, 1, upper.tail = TRUE) |
| if(p > alpha) |
| uniroot(function(t) |
| pn2x2xk(x, t, upper.tail = TRUE) - alpha, |
| c(0, 1))$root |
| else if (p < alpha) |
| 1 / uniroot(function(t) |
| pn2x2xk(x, 1/t, upper.tail = TRUE) - alpha, |
| c(.Machine$double.eps, 1))$root |
| else |
| 1 |
| } |
| CINT <- switch(alternative, |
| less = c(0, ncp.U(s, 1 - conf.level)), |
| greater = c(ncp.L(s, 1 - conf.level), Inf), |
| two.sided = { |
| alpha <- (1 - conf.level) / 2 |
| c(ncp.L(s, alpha), ncp.U(s, alpha)) |
| }) |
| |
| STATISTIC <- c(S = s) |
| RVAL <- list(statistic = STATISTIC, |
| p.value = PVAL) |
| } |
| |
| names(ESTIMATE) <- names(NVAL) |
| attr(CINT, "conf.level") <- conf.level |
| RVAL <- c(RVAL, |
| list(conf.int = CINT, |
| estimate = ESTIMATE, |
| null.value = NVAL, |
| alternative = alternative)) |
| } |
| else { |
| ## Generalized Cochran-Mantel-Haenszel I x J x K test |
| ## Agresti (1990), pages 234--235. |
| ## Agresti (2002), pages 295ff. |
| ## Note that n in the reference is in column-major order. |
| ## (Thanks to Torsten Hothorn for spotting this.) |
| df <- (I - 1) * (J - 1) |
| n <- m <- double(length = df) |
| V <- matrix(0, nrow = df, ncol = df) |
| for (k in 1 : K) { |
| f <- x[ , , k] # frequencies in stratum k |
| ntot <- as.double(sum(f)) # n_{..k} (and avoid overflow) |
| rowsums <- rowSums(f)[-I] # n_{i.k}, i = 1 to I-1 |
| colsums <- colSums(f)[-J] # n_{.jk}, j = 1 to J-1 |
| n <- n + c(f[-I, -J]) |
| m <- m + c(outer(rowsums, colsums, "*")) / ntot |
| V <- V + (kronecker(diag(ntot * colsums, nrow = J - 1L) |
| - outer(colsums, colsums), |
| diag(ntot * rowsums, nrow = I - 1L) |
| - outer(rowsums, rowsums)) |
| / (ntot^2 * (ntot - 1))) |
| } |
| n <- n - m |
| STATISTIC <- c(crossprod(n, qr.solve(V, n))) |
| PARAMETER <- df |
| PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE) |
| names(STATISTIC) <- "Cochran-Mantel-Haenszel M^2" |
| names(PARAMETER) <- "df" |
| METHOD <- "Cochran-Mantel-Haenszel test" |
| RVAL <- list(statistic = STATISTIC, |
| parameter = PARAMETER, |
| p.value = PVAL) |
| } |
| structure(c(RVAL, |
| list(method = METHOD, |
| data.name = DNAME)), |
| class = "htest") |
| } |