| # File src/library/stats/R/wilcox.test.R |
| # Part of the R package, https://www.R-project.org |
| # |
| # Copyright (C) 1995-2016 The R Core Team |
| # |
| # This program is free software; you can redistribute it and/or modify |
| # it under the terms of the GNU General Public License as published by |
| # the Free Software Foundation; either version 2 of the License, or |
| # (at your option) any later version. |
| # |
| # This program is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # A copy of the GNU General Public License is available at |
| # https://www.R-project.org/Licenses/ |
| |
| wilcox.test <- function(x, ...) UseMethod("wilcox.test") |
| |
| wilcox.test.default <- |
| function(x, y = NULL, alternative = c("two.sided", "less", "greater"), |
| mu = 0, paired = FALSE, exact = NULL, correct = TRUE, |
| conf.int = FALSE, conf.level = 0.95, ...) |
| { |
| alternative <- match.arg(alternative) |
| if(!missing(mu) && ((length(mu) > 1L) || !is.finite(mu))) |
| stop("'mu' must be a single number") |
| if(conf.int) { |
| if(!((length(conf.level) == 1L) |
| && is.finite(conf.level) |
| && (conf.level > 0) |
| && (conf.level < 1))) |
| stop("'conf.level' must be a single number between 0 and 1") |
| } |
| |
| if(!is.numeric(x)) stop("'x' must be numeric") |
| if(!is.null(y)) { |
| if(!is.numeric(y)) stop("'y' must be numeric") |
| DNAME <- paste(deparse(substitute(x)), "and", |
| deparse(substitute(y))) |
| if(paired) { |
| if(length(x) != length(y)) |
| stop("'x' and 'y' must have the same length") |
| OK <- complete.cases(x, y) |
| x <- x[OK] - y[OK] |
| y <- NULL |
| } |
| else { |
| x <- x[is.finite(x)] |
| y <- y[is.finite(y)] |
| } |
| } else { |
| DNAME <- deparse(substitute(x)) |
| if(paired) |
| stop("'y' is missing for paired test") |
| x <- x[is.finite(x)] |
| } |
| |
| if(length(x) < 1L) |
| stop("not enough (finite) 'x' observations") |
| CORRECTION <- 0 |
| if(is.null(y)) { |
| METHOD <- "Wilcoxon signed rank test" |
| x <- x - mu |
| ZEROES <- any(x == 0) |
| if(ZEROES) |
| x <- x[x != 0] |
| n <- as.double(length(x)) |
| if(is.null(exact)) |
| exact <- (n < 50) |
| r <- rank(abs(x)) |
| STATISTIC <- setNames(sum(r[x > 0]), "V") |
| TIES <- length(r) != length(unique(r)) |
| |
| if(exact && !TIES && !ZEROES) { |
| PVAL <- |
| switch(alternative, |
| "two.sided" = { |
| p <- if(STATISTIC > (n * (n + 1) / 4)) |
| psignrank(STATISTIC - 1, n, lower.tail = FALSE) |
| else psignrank(STATISTIC, n) |
| min(2 * p, 1) |
| }, |
| "greater" = psignrank(STATISTIC - 1, n, lower.tail = FALSE), |
| "less" = psignrank(STATISTIC, n)) |
| if(conf.int) { |
| ## Exact confidence interval for the median in the |
| ## one-sample case. When used with paired values this |
| ## gives a confidence interval for mean(x) - mean(y). |
| x <- x + mu # we want a conf.int for the median |
| alpha <- 1 - conf.level |
| diffs <- outer(x, x, "+") |
| diffs <- sort(diffs[!lower.tri(diffs)]) / 2 |
| cint <- |
| switch(alternative, |
| "two.sided" = { |
| qu <- qsignrank(alpha / 2, n) |
| if(qu == 0) qu <- 1 |
| ql <- n*(n+1)/2 - qu |
| achieved.alpha <- 2*psignrank(trunc(qu)-1,n) |
| c(diffs[qu], diffs[ql+1]) |
| }, |
| "greater" = { |
| qu <- qsignrank(alpha, n) |
| if(qu == 0) qu <- 1 |
| achieved.alpha <- psignrank(trunc(qu)-1,n) |
| c(diffs[qu], +Inf) |
| }, |
| "less" = { |
| qu <- qsignrank(alpha, n) |
| if(qu == 0) qu <- 1 |
| ql <- n*(n+1)/2 - qu |
| achieved.alpha <- psignrank(trunc(qu)-1,n) |
| c(-Inf, diffs[ql+1]) |
| }) |
| if (achieved.alpha - alpha > alpha/2){ |
| warning("requested conf.level not achievable") |
| conf.level <- 1 - signif(achieved.alpha, 2) |
| } |
| attr(cint, "conf.level") <- conf.level |
| ESTIMATE <- c("(pseudo)median" = median(diffs)) |
| } |
| } else { ## not exact, maybe ties or zeroes |
| NTIES <- table(r) |
| z <- STATISTIC - n * (n + 1)/4 |
| SIGMA <- sqrt(n * (n + 1) * (2 * n + 1) / 24 |
| - sum(NTIES^3 - NTIES) / 48) |
| if(correct) { |
| CORRECTION <- |
| switch(alternative, |
| "two.sided" = sign(z) * 0.5, |
| "greater" = 0.5, |
| "less" = -0.5) |
| METHOD <- paste(METHOD, "with continuity correction") |
| } |
| z <- (z - CORRECTION) / SIGMA |
| PVAL <- switch(alternative, |
| "less" = pnorm(z), |
| "greater" = pnorm(z, lower.tail=FALSE), |
| "two.sided" = 2 * min(pnorm(z), |
| pnorm(z, lower.tail=FALSE))) |
| if(conf.int) { |
| ## Asymptotic confidence interval for the median in the |
| ## one-sample case. When used with paired values this |
| ## gives a confidence interval for mean(x) - mean(y). |
| ## Algorithm not published, thus better documented here. |
| x <- x + mu |
| alpha <- 1 - conf.level |
| if(n > 0) { |
| ## These are sample based limits for the median |
| ## [They don't work if alpha is too high] |
| mumin <- min(x) |
| mumax <- max(x) |
| ## wdiff(d, zq) returns the absolute difference between |
| ## the asymptotic Wilcoxon statistic of x - mu - d and |
| ## the quantile zq. |
| W <- function(d) { ## also fn(x, correct, alternative) |
| xd <- x - d |
| xd <- xd[xd != 0] |
| nx <- length(xd) |
| dr <- rank(abs(xd)) |
| zd <- sum(dr[xd > 0]) - nx * (nx + 1)/4 |
| NTIES.CI <- table(dr) |
| SIGMA.CI <- sqrt(nx * (nx + 1) * (2 * nx + 1) / 24 |
| - sum(NTIES.CI^3 - NTIES.CI) / 48) |
| if (SIGMA.CI == 0) |
| warning( |
| "cannot compute confidence interval when all observations are zero or tied", |
| call.=FALSE) |
| CORRECTION.CI <- |
| if(correct) { |
| switch(alternative, |
| "two.sided" = sign(zd) * 0.5, |
| "greater" = 0.5, |
| "less" = -0.5) |
| } else 0 |
| (zd - CORRECTION.CI) / SIGMA.CI |
| } |
| Wmumin <- W(mumin) |
| Wmumax <- if(!is.finite(Wmumin)) NA else W(mumax) # if(): warn only once |
| } |
| if(n == 0 || !is.finite(Wmumax)) { # incl. "all zero / ties" warning above |
| cint <- structure(c(if(alternative == "less" ) -Inf else NaN, |
| if(alternative == "greater") +Inf else NaN), |
| conf.level = 0) |
| ESTIMATE <- if(n > 0) c(midrange = (mumin+mumax)/2) else NaN |
| } else { # (Wmumin, Wmumax) are finite |
| wdiff <- function(d, zq) W(d) - zq |
| ## Here we optimize the function wdiff in d over the set |
| ## c(mumin, mumax). |
| ## This returns a value from c(mumin, mumax) for which |
| ## the asymptotic Wilcoxon statistic is equal to the |
| ## quantile zq. This means that the statistic is not |
| ## within the critical region, and that implies that d |
| ## is a confidence limit for the median. |
| ## |
| ## As in the exact case, interchange quantiles. |
| root <- function(zq) { |
| uniroot(wdiff, lower = mumin, upper = mumax, |
| f.lower = Wmumin - zq, f.upper = Wmumax - zq, |
| tol = 1e-4, zq = zq)$root |
| } |
| |
| cint <- switch(alternative, "two.sided" = { |
| repeat { ## FIXME: no need to loop for finding boundary alpha !! |
| mindiff <- Wmumin - qnorm(alpha/2, lower.tail = FALSE) |
| maxdiff <- Wmumax - qnorm(alpha/2) |
| if(mindiff < 0 || maxdiff > 0) alpha <- alpha*2 else break |
| } |
| if (alpha >= 1 || 1 - conf.level < alpha*0.75) { |
| conf.level <- 1 - pmin(1, alpha) |
| warning("requested conf.level not achievable") |
| } |
| if(alpha < 1) { |
| l <- root(zq = qnorm(alpha/2, lower.tail = FALSE)) |
| u <- root(zq = qnorm(alpha/2)) |
| c(l, u) |
| } else { ## alpha >= 1 |
| rep(median(x), 2) |
| } |
| }, "greater" = { |
| repeat { ## FIXME: no need to loop for finding boundary alpha !! |
| mindiff <- Wmumin - qnorm(alpha, lower.tail = FALSE) |
| if(mindiff < 0) alpha <- alpha*2 else break |
| } |
| if (alpha >= 1 || 1 - conf.level < alpha*0.75) { |
| conf.level <- 1 - pmin(1, alpha) |
| warning("requested conf.level not achievable") |
| } |
| l <- if(alpha < 1) |
| root(zq = qnorm(alpha, lower.tail = FALSE)) |
| else ## alpha >= 1 |
| median(x) |
| c(l, +Inf) |
| |
| }, "less" = { |
| repeat { ## FIXME: no need to loop for finding boundary alpha !! |
| maxdiff <- Wmumax - qnorm(alpha/2) |
| if(maxdiff > 0) alpha <- alpha * 2 else break |
| } |
| if (alpha >= 1 || 1 - conf.level < alpha*0.75) { |
| conf.level <- 1 - pmin(1, alpha) |
| warning("requested conf.level not achievable") |
| } |
| u <- if(alpha < 1) |
| root(zq = qnorm(alpha)) |
| else |
| median(x) |
| c(-Inf, u) |
| }) |
| attr(cint, "conf.level") <- conf.level |
| correct <- FALSE # for W(): no continuity correction for estimate |
| ESTIMATE <- c("(pseudo)median" = |
| uniroot(W, lower = mumin, upper = mumax, |
| tol = 1e-4)$root) |
| } # regular (Wmumin, Wmumax) |
| } # end{conf.int} |
| if(exact && TIES) { |
| warning("cannot compute exact p-value with ties") |
| if(conf.int) |
| warning("cannot compute exact confidence interval with ties") |
| } |
| if(exact && ZEROES) { |
| warning("cannot compute exact p-value with zeroes") |
| if(conf.int) |
| warning("cannot compute exact confidence interval with zeroes") |
| } |
| } |
| } |
| else { ##-------------------------- 2-sample case --------------------------- |
| if(length(y) < 1L) |
| stop("not enough 'y' observations") |
| METHOD <- "Wilcoxon rank sum test" |
| r <- rank(c(x - mu, y)) |
| n.x <- as.double(length(x)) |
| n.y <- as.double(length(y)) |
| if(is.null(exact)) |
| exact <- (n.x < 50) && (n.y < 50) |
| STATISTIC <- c("W" = sum(r[seq_along(x)]) - n.x * (n.x + 1) / 2) |
| TIES <- (length(r) != length(unique(r))) |
| if(exact && !TIES) { |
| PVAL <- |
| switch(alternative, |
| "two.sided" = { |
| p <- if(STATISTIC > (n.x * n.y / 2)) |
| pwilcox(STATISTIC - 1, n.x, n.y, lower.tail = FALSE) |
| else |
| pwilcox(STATISTIC, n.x, n.y) |
| min(2 * p, 1) |
| }, |
| "greater" = { |
| pwilcox(STATISTIC - 1, n.x, n.y, lower.tail = FALSE) |
| }, |
| "less" = pwilcox(STATISTIC, n.x, n.y)) |
| if(conf.int) { |
| ## Exact confidence interval for the location parameter |
| ## mean(x) - mean(y) in the two-sample case (cf. the |
| ## one-sample case). |
| alpha <- 1 - conf.level |
| diffs <- sort(outer(x, y, "-")) |
| cint <- |
| switch(alternative, |
| "two.sided" = { |
| qu <- qwilcox(alpha/2, n.x, n.y) |
| if(qu == 0) qu <- 1 |
| ql <- n.x*n.y - qu |
| achieved.alpha <- 2*pwilcox(trunc(qu)-1,n.x,n.y) |
| c(diffs[qu], diffs[ql + 1]) |
| }, |
| "greater" = { |
| qu <- qwilcox(alpha, n.x, n.y) |
| if(qu == 0) qu <- 1 |
| achieved.alpha <- pwilcox(trunc(qu)-1,n.x,n.y) |
| c(diffs[qu], +Inf) |
| }, |
| "less" = { |
| qu <- qwilcox(alpha, n.x, n.y) |
| if(qu == 0) qu <- 1 |
| ql <- n.x*n.y - qu |
| achieved.alpha <- pwilcox(trunc(qu)-1,n.x,n.y) |
| c(-Inf, diffs[ql + 1]) |
| }) |
| if (achieved.alpha-alpha > alpha/2) { |
| warning("Requested conf.level not achievable") |
| conf.level <- 1 - achieved.alpha |
| } |
| attr(cint, "conf.level") <- conf.level |
| ESTIMATE <- c("difference in location" = median(diffs)) |
| } |
| } |
| else { ## not exact, maybe ties or zeroes |
| NTIES <- table(r) |
| z <- STATISTIC - n.x * n.y / 2 |
| SIGMA <- sqrt((n.x * n.y / 12) * |
| ((n.x + n.y + 1) |
| - sum(NTIES^3 - NTIES) |
| / ((n.x + n.y) * (n.x + n.y - 1)))) |
| if(correct) { |
| CORRECTION <- switch(alternative, |
| "two.sided" = sign(z) * 0.5, |
| "greater" = 0.5, |
| "less" = -0.5) |
| METHOD <- paste(METHOD, "with continuity correction") |
| } |
| z <- (z - CORRECTION) / SIGMA |
| PVAL <- switch(alternative, |
| "less" = pnorm(z), |
| "greater" = pnorm(z, lower.tail=FALSE), |
| "two.sided" = 2 * min(pnorm(z), |
| pnorm(z, lower.tail=FALSE))) |
| if(conf.int) { |
| ## Asymptotic confidence interval for the location |
| ## parameter mean(x) - mean(y) in the two-sample case |
| ## (cf. one-sample case). |
| ## |
| ## Algorithm not published, for a documentation see the |
| ## one-sample case. |
| alpha <- 1 - conf.level |
| mumin <- min(x) - max(y) |
| mumax <- max(x) - min(y) |
| W <- function(d) { ## also fn (x, y, n.x, n.y, correct, alternative) |
| dr <- rank(c(x - d, y)) |
| NTIES.CI <- table(dr) |
| dz <- sum(dr[seq_along(x)]) - n.x * (n.x + 1) / 2 - n.x * n.y / 2 |
| CORRECTION.CI <- |
| if(correct) { |
| switch(alternative, |
| "two.sided" = sign(dz) * 0.5, |
| "greater" = 0.5, |
| "less" = -0.5) |
| } else 0 |
| SIGMA.CI <- sqrt((n.x * n.y / 12) * |
| ((n.x + n.y + 1) |
| - sum(NTIES.CI^3 - NTIES.CI) |
| / ((n.x + n.y) * (n.x + n.y - 1)))) |
| if (SIGMA.CI == 0) |
| warning( |
| "cannot compute confidence interval when all observations are tied", |
| call.=FALSE) |
| (dz - CORRECTION.CI) / SIGMA.CI |
| } |
| wdiff <- function(d, zq) W(d) - zq |
| Wmumin <- W(mumin) |
| Wmumax <- W(mumax) |
| root <- function(zq) { |
| ## in extreme cases we need to return endpoints, |
| ## e.g. wilcox.test(1, 2:60, conf.int=TRUE) |
| f.lower <- Wmumin - zq |
| if(f.lower <= 0) return(mumin) |
| f.upper <- Wmumax - zq |
| if(f.upper >= 0) return(mumax) |
| uniroot(wdiff, lower=mumin, upper=mumax, |
| f.lower = f.lower, f.upper = f.upper, |
| tol = 1e-4, zq = zq)$root |
| } |
| cint <- switch(alternative, |
| "two.sided" = { |
| l <- root(zq = qnorm(alpha/2, lower.tail = FALSE)) |
| u <- root(zq = qnorm(alpha/2)) |
| c(l, u) |
| }, |
| "greater" = { |
| l <- root(zq = qnorm(alpha, lower.tail = FALSE)) |
| c(l, +Inf) |
| }, |
| "less" = { |
| u <- root(zq = qnorm(alpha)) |
| c(-Inf, u) |
| }) |
| attr(cint, "conf.level") <- conf.level |
| correct <- FALSE # for W(): no continuity correction for estimate |
| ESTIMATE <- c("difference in location" = |
| uniroot(W, lower=mumin, upper=mumax, |
| tol = 1e-4)$root) |
| } |
| |
| if(exact && TIES) { |
| warning("cannot compute exact p-value with ties") |
| if(conf.int) |
| warning("cannot compute exact confidence intervals with ties") |
| } |
| } |
| } |
| |
| names(mu) <- if(paired || !is.null(y)) "location shift" else "location" |
| RVAL <- list(statistic = STATISTIC, |
| parameter = NULL, |
| p.value = as.numeric(PVAL), |
| null.value = mu, |
| alternative = alternative, |
| method = METHOD, |
| data.name = DNAME) |
| if(conf.int) |
| RVAL <- c(RVAL, |
| list(conf.int = cint, |
| estimate = ESTIMATE)) |
| class(RVAL) <- "htest" |
| RVAL |
| } |
| |
| wilcox.test.formula <- |
| function(formula, data, subset, na.action, ...) |
| { |
| if(missing(formula) |
| || (length(formula) != 3L) |
| || (length(attr(terms(formula[-2L]), "term.labels")) != 1L)) |
| stop("'formula' missing or incorrect") |
| m <- match.call(expand.dots = FALSE) |
| if(is.matrix(eval(m$data, parent.frame()))) |
| m$data <- as.data.frame(data) |
| ## need stats:: for non-standard evaluation |
| m[[1L]] <- quote(stats::model.frame) |
| m$... <- NULL |
| mf <- eval(m, parent.frame()) |
| DNAME <- paste(names(mf), collapse = " by ") |
| names(mf) <- NULL |
| response <- attr(attr(mf, "terms"), "response") |
| g <- factor(mf[[-response]]) |
| if(nlevels(g) != 2L) |
| stop("grouping factor must have exactly 2 levels") |
| DATA <- setNames(split(mf[[response]], g), |
| c("x", "y")) |
| y <- do.call("wilcox.test", c(DATA, list(...))) |
| y$data.name <- DNAME |
| y |
| } |