blob: 586f320406280fc868f410fa1957d3b70873ea6f [file] [log] [blame]
# File src/library/stats/R/cor.test.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 1995-2016 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
cor.test <- function(x, ...) UseMethod("cor.test")
cor.test.default <-
function(x, y, alternative = c("two.sided", "less", "greater"),
method = c("pearson", "kendall", "spearman"), exact = NULL,
conf.level = 0.95, continuity = FALSE, ...)
{
alternative <- match.arg(alternative)
method <- match.arg(method)
DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))
if(length(x) != length(y))
stop("'x' and 'y' must have the same length")
if(!is.numeric(x)) stop("'x' must be a numeric vector")
if(!is.numeric(y)) stop("'y' must be a numeric vector")
OK <- complete.cases(x, y)
x <- x[OK]
y <- y[OK]
n <- length(x)
NVAL <- 0
conf.int <- FALSE
if(method == "pearson") {
if(n < 3L)
stop("not enough finite observations")
method <- "Pearson's product-moment correlation"
names(NVAL) <- "correlation"
r <- cor(x, y)
df <- n - 2L
ESTIMATE <- c(cor = r)
PARAMETER <- c(df = df)
STATISTIC <- c(t = sqrt(df) * r / sqrt(1 - r^2))
if(n > 3) { ## confidence int.
if(!missing(conf.level) &&
(length(conf.level) != 1 || !is.finite(conf.level) ||
conf.level < 0 || conf.level > 1))
stop("'conf.level' must be a single number between 0 and 1")
conf.int <- TRUE
z <- atanh(r)
sigma <- 1 / sqrt(n - 3)
cint <-
switch(alternative,
less = c(-Inf, z + sigma * qnorm(conf.level)),
greater = c(z - sigma * qnorm(conf.level), Inf),
two.sided = z +
c(-1, 1) * sigma * qnorm((1 + conf.level) / 2))
cint <- tanh(cint)
attr(cint, "conf.level") <- conf.level
}
PVAL <- switch(alternative,
"less" = pt(STATISTIC, df),
"greater" = pt(STATISTIC, df, lower.tail=FALSE),
"two.sided" = 2 * min(pt(STATISTIC, df),
pt(STATISTIC, df, lower.tail=FALSE)))
}
else {
if(n < 2)
stop("not enough finite observations")
PARAMETER <- NULL
TIES <- (min(length(unique(x)), length(unique(y))) < n)
if(method == "kendall") {
method <- "Kendall's rank correlation tau"
names(NVAL) <- "tau"
r <- cor(x,y, method = "kendall")
ESTIMATE <- c(tau = r)
if(!is.finite(ESTIMATE)) { # all x or all y the same
ESTIMATE[] <- NA
STATISTIC <- c(T = NA)
PVAL <- NA
}
else {
if(is.null(exact))
exact <- (n < 50)
if(exact && !TIES) {
q <- round((r + 1) * n * (n - 1) / 4)
STATISTIC <- c(T = q)
pkendall <- function(q, n) .Call(C_pKendall, q, n)
PVAL <-
switch(alternative,
"two.sided" = {
if(q > n * (n - 1) / 4)
p <- 1 - pkendall(q - 1, n)
else
p <- pkendall(q, n)
min(2 * p, 1)
},
"greater" = 1 - pkendall(q - 1, n),
"less" = pkendall(q, n))
} else {
xties <- table(x[duplicated(x)]) + 1
yties <- table(y[duplicated(y)]) + 1
T0 <- n * (n - 1)/2
T1 <- sum(xties * (xties - 1))/2
T2 <- sum(yties * (yties - 1))/2
S <- r * sqrt((T0 - T1) * (T0 - T2))
v0 <- n * (n - 1) * (2 * n + 5)
vt <- sum(xties * (xties - 1) * (2 * xties + 5))
vu <- sum(yties * (yties - 1) * (2 * yties + 5))
v1 <- sum(xties * (xties - 1)) * sum(yties * (yties - 1))
v2 <- sum(xties * (xties - 1) * (xties - 2)) *
sum(yties * (yties - 1) * (yties - 2))
var_S <- (v0 - vt - vu) / 18 +
v1 / (2 * n * (n - 1)) +
v2 / (9 * n * (n - 1) * (n - 2))
if(exact && TIES)
warning("Cannot compute exact p-value with ties")
if (continuity) S <- sign(S) * (abs(S) - 1)
STATISTIC <- c(z = S / sqrt(var_S))
PVAL <- switch(alternative,
"less" = pnorm(STATISTIC),
"greater" = pnorm(STATISTIC, lower.tail=FALSE),
"two.sided" = 2 * min(pnorm(STATISTIC),
pnorm(STATISTIC, lower.tail=FALSE)))
}
}
} else {
method <- "Spearman's rank correlation rho"
if (is.null(exact))
exact <- TRUE
names(NVAL) <- "rho"
r <- cor(rank(x), rank(y))
ESTIMATE <- c(rho = r)
if(!is.finite(ESTIMATE)) { # all x or all y the same
ESTIMATE[] <- NA
STATISTIC <- c(S = NA)
PVAL <- NA
}
else {
## Use the test statistic S = sum(rank(x) - rank(y))^2
## and AS 89 for obtaining better p-values than via the
## simple normal approximation.
## In the case of no ties, S = (1-rho) * (n^3-n)/6.
pspearman <- function(q, n, lower.tail = TRUE) {
if(n <= 1290 && exact) # n*(n^2 - 1) does not overflow
.Call(C_pRho, round(q) + 2*lower.tail, n, lower.tail)
else { # for large n: asymptotic t_{n-2}
den <- (n*(n^2-1))/6 # careful for overflow
## Kendall et all (1939) p. 260
if (continuity) den <- den + 1
r <- 1 - q/den
pt(r / sqrt((1 - r^2)/(n-2)), df = n-2,
lower.tail = !lower.tail)
}
}
q <- (n^3 - n) * (1 - r) / 6
STATISTIC <- c(S = q)
if(TIES && exact){
exact <- FALSE
warning("Cannot compute exact p-value with ties")
}
PVAL <-
switch(alternative,
"two.sided" = {
p <- if(q > (n^3 - n) / 6)
pspearman(q, n, lower.tail = FALSE)
else
pspearman(q, n, lower.tail = TRUE)
min(2 * p, 1)
},
"greater" = pspearman(q, n, lower.tail = TRUE),
"less" = pspearman(q, n, lower.tail = FALSE))
}
}
}
RVAL <- list(statistic = STATISTIC,
parameter = PARAMETER,
p.value = as.numeric(PVAL),
estimate = ESTIMATE,
null.value = NVAL,
alternative = alternative,
method = method,
data.name = DNAME)
if(conf.int)
RVAL <- c(RVAL, list(conf.int = cint))
class(RVAL) <- "htest"
RVAL
}
cor.test.formula <-
function(formula, data, subset, na.action, ...)
{
if(missing(formula)
|| !inherits(formula, "formula")
|| length(formula) != 2L)
stop("'formula' missing or invalid")
m <- match.call(expand.dots = FALSE)
if(is.matrix(eval(m$data, parent.frame())))
m$data <- as.data.frame(data)
## need stats:: for non-standard evaluation
m[[1L]] <- quote(stats::model.frame)
m$... <- NULL
mf <- eval(m, environment(formula))
if(length(mf) != 2L)
stop("invalid formula")
DNAME <- paste(names(mf), collapse = " and ")
names(mf) <- c("x", "y")
y <- do.call("cor.test", c(mf, list(...)))
y$data.name <- DNAME
y
}