src/library/stats/R/chisq.test.R - R - Git at Google

 #  File src/library/stats/R/chisq.test.R
 #  Part of the R package, https://www.R-project.org
 #
 #  Copyright (C) 1995-2013 The R Core Team
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  A copy of the GNU General Public License is available at
 #  https://www.R-project.org/Licenses/

 chisq.test <- function(x, y = NULL, correct = TRUE,
 		       p = rep(1/length(x), length(x)),
 		       rescale.p = FALSE, simulate.p.value = FALSE, B = 2000)
 {
     DNAME <- deparse(substitute(x))
     if (is.data.frame(x))
 	x <- as.matrix(x)
     if (is.matrix(x)) { # why not just drop()?
 	if (min(dim(x)) == 1L)
 	    x <- as.vector(x)
     }
     if (!is.matrix(x) && !is.null(y)) {
 	if (length(x) != length(y))
 	    stop("'x' and 'y' must have the same length")
         DNAME2 <- deparse(substitute(y))
         ## omit names on dims if too long (and 1 line might already be too long)
         xname <- if(length(DNAME) > 1L || nchar(DNAME, "w") > 30) "" else DNAME
         yname <- if(length(DNAME2) > 1L || nchar(DNAME2, "w") > 30) "" else DNAME2
 	OK <- complete.cases(x, y)
 	x <- factor(x[OK])
 	y <- factor(y[OK])
 	if ((nlevels(x) < 2L) || (nlevels(y) < 2L))
 	    stop("'x' and 'y' must have at least 2 levels")
 	## Could also call table() with 'deparse.level = 2', but we need
 	## to deparse ourselves for DNAME anyway ...
 	x <- table(x, y)
 	names(dimnames(x)) <- c(xname, yname)
         ## unclear what to do here: might abbreviating be preferable?
 	DNAME <- paste(paste(DNAME, collapse = "\n"),
                        "and",
                        paste(DNAME2, collapse = "\n"))
     }

     if (any(x < 0) || anyNA(x))
 	stop("all entries of 'x' must be nonnegative and finite")
     if ((n <- sum(x)) == 0)
 	stop("at least one entry of 'x' must be positive")

     if(simulate.p.value) {
 	setMETH <- function() # you shalt not cut_n_paste
 	    METHOD <<- paste(METHOD,
 			     "with simulated p-value\n\t (based on", B,
 			     "replicates)")
         almost.1 <- 1 - 64 * .Machine$double.eps
     }
     if (is.matrix(x)) {
 	METHOD <- "Pearson's Chi-squared test"
         nr <- as.integer(nrow(x))
         nc <- as.integer(ncol(x))
         if (is.na(nr) || is.na(nc) || is.na(nr * nc))
             stop("invalid nrow(x) or ncol(x)", domain = NA)
 	sr <- rowSums(x)
 	sc <- colSums(x)
 	E <- outer(sr, sc, "*") / n

         ## Cell residual variance. Essentially formula (2.9) in Agresti(2007).
         v <- function(r, c, n) c * r * (n - r) * (n - c)/n^3

         V <- outer(sr, sc, v, n)

 	dimnames(E) <- dimnames(x)
 	if (simulate.p.value && all(sr > 0) && all(sc > 0)) {
 	    setMETH()
             tmp <- .Call(C_chisq_sim, sr, sc, B, E)
 	    ## Sorting before summing may look strange, but seems to be
 	    ## a sensible way to deal with rounding issues (PR#3486):
 	    STATISTIC <- sum(sort((x - E) ^ 2 / E, decreasing = TRUE))
 	    PARAMETER <- NA
 	    ## use correct significance level for a Monte Carlo test
 	    PVAL <- (1 + sum(tmp >= almost.1 * STATISTIC)) / (B + 1)
 	}
 	else {
 	    if (simulate.p.value)
 		warning("cannot compute simulated p-value with zero marginals")
 	    if (correct && nrow(x) == 2L && ncol(x) == 2L) {
 		YATES <- min(0.5, abs(x-E))
                 if (YATES > 0)
 		    METHOD <- paste(METHOD, "with Yates' continuity correction")
 	    }
 	    else
 		YATES <- 0
 	    STATISTIC <- sum((abs(x - E) - YATES)^2 / E)
 	    PARAMETER <- (nr - 1L) * (nc - 1L)
 	    PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
 	}
     }
     else {
         if(length(dim(x)) > 2L)
             stop("invalid 'x'")
 	if (length(x) == 1L)
 	    stop("'x' must at least have 2 elements")
 	if (length(x) != length(p))
 	    stop("'x' and 'p' must have the same number of elements")
 	if(any(p < 0)) stop("probabilities must be non-negative.")
 	if(abs(sum(p)-1) > sqrt(.Machine$double.eps)) {
 	    if(rescale.p) p <- p/sum(p)
 	    else stop("probabilities must sum to 1.")
 	}
 	METHOD <- "Chi-squared test for given probabilities"
 	E <- n * p
         V <- n * p * (1 - p)
 	STATISTIC <- sum((x - E) ^ 2 / E)
 	names(E) <- names(x)
 	if(simulate.p.value) {
 	    setMETH()
 	    nx <- length(x)
 	    sm <- matrix(sample.int(nx, B*n, TRUE, prob = p),nrow = n)
 	    ss <- apply(sm, 2L, function(x,E,k) {
 		sum((table(factor(x, levels=1L:k)) - E)^2 / E)
 	    }, E = E, k = nx)
 	    PARAMETER <- NA
 	    PVAL <- (1 + sum(ss >= almost.1 * STATISTIC))/(B + 1)
 	} else {
 	    PARAMETER <- length(x) - 1
 	    PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
 	}
     }

     names(STATISTIC) <- "X-squared"
     names(PARAMETER) <- "df"
     if (any(E < 5) && is.finite(PARAMETER))
 	warning("Chi-squared approximation may be incorrect")

     structure(list(statistic = STATISTIC,
 		   parameter = PARAMETER,
 		   p.value = PVAL,
 		   method = METHOD,
 		   data.name = DNAME,
 		   observed = x,
 		   expected = E,
 		   residuals = (x - E) / sqrt(E),
                    stdres = (x - E) / sqrt(V) ),
 	      class = "htest")
 }
	# File src/library/stats/R/chisq.test.R
	# Part of the R package, https://www.R-project.org
	#
	# Copyright (C) 1995-2013 The R Core Team
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# A copy of the GNU General Public License is available at
	# https://www.R-project.org/Licenses/

	chisq.test <- function(x, y = NULL, correct = TRUE,
	p = rep(1/length(x), length(x)),
	rescale.p = FALSE, simulate.p.value = FALSE, B = 2000)
	{
	DNAME <- deparse(substitute(x))
	if (is.data.frame(x))
	x <- as.matrix(x)
	if (is.matrix(x)) { # why not just drop()?
	if (min(dim(x)) == 1L)
	x <- as.vector(x)
	}
	if (!is.matrix(x) && !is.null(y)) {
	if (length(x) != length(y))
	stop("'x' and 'y' must have the same length")
	DNAME2 <- deparse(substitute(y))
	## omit names on dims if too long (and 1 line might already be too long)
	xname <- if(length(DNAME) > 1L \|\| nchar(DNAME, "w") > 30) "" else DNAME
	yname <- if(length(DNAME2) > 1L \|\| nchar(DNAME2, "w") > 30) "" else DNAME2
	OK <- complete.cases(x, y)
	x <- factor(x[OK])
	y <- factor(y[OK])
	if ((nlevels(x) < 2L) \|\| (nlevels(y) < 2L))
	stop("'x' and 'y' must have at least 2 levels")
	## Could also call table() with 'deparse.level = 2', but we need
	## to deparse ourselves for DNAME anyway ...
	x <- table(x, y)
	names(dimnames(x)) <- c(xname, yname)
	## unclear what to do here: might abbreviating be preferable?
	DNAME <- paste(paste(DNAME, collapse = "\n"),
	"and",
	paste(DNAME2, collapse = "\n"))
	}

	if (any(x < 0) \|\| anyNA(x))
	stop("all entries of 'x' must be nonnegative and finite")
	if ((n <- sum(x)) == 0)
	stop("at least one entry of 'x' must be positive")

	if(simulate.p.value) {
	setMETH <- function() # you shalt not cut_n_paste
	METHOD <<- paste(METHOD,
	"with simulated p-value\n\t (based on", B,
	"replicates)")
	almost.1 <- 1 - 64 * .Machine$double.eps
	}
	if (is.matrix(x)) {
	METHOD <- "Pearson's Chi-squared test"
	nr <- as.integer(nrow(x))
	nc <- as.integer(ncol(x))
	if (is.na(nr) \|\| is.na(nc) \|\| is.na(nr * nc))
	stop("invalid nrow(x) or ncol(x)", domain = NA)
	sr <- rowSums(x)
	sc <- colSums(x)
	E <- outer(sr, sc, "*") / n

	## Cell residual variance. Essentially formula (2.9) in Agresti(2007).
	v <- function(r, c, n) c * r * (n - r) * (n - c)/n^3

	V <- outer(sr, sc, v, n)

	dimnames(E) <- dimnames(x)
	if (simulate.p.value && all(sr > 0) && all(sc > 0)) {
	setMETH()
	tmp <- .Call(C_chisq_sim, sr, sc, B, E)
	## Sorting before summing may look strange, but seems to be
	## a sensible way to deal with rounding issues (PR#3486):
	STATISTIC <- sum(sort((x - E) ^ 2 / E, decreasing = TRUE))
	PARAMETER <- NA
	## use correct significance level for a Monte Carlo test
	PVAL <- (1 + sum(tmp >= almost.1 * STATISTIC)) / (B + 1)
	}
	else {
	if (simulate.p.value)
	warning("cannot compute simulated p-value with zero marginals")
	if (correct && nrow(x) == 2L && ncol(x) == 2L) {
	YATES <- min(0.5, abs(x-E))
	if (YATES > 0)
	METHOD <- paste(METHOD, "with Yates' continuity correction")
	}
	else
	YATES <- 0
	STATISTIC <- sum((abs(x - E) - YATES)^2 / E)
	PARAMETER <- (nr - 1L) * (nc - 1L)
	PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
	}
	}
	else {
	if(length(dim(x)) > 2L)
	stop("invalid 'x'")
	if (length(x) == 1L)
	stop("'x' must at least have 2 elements")
	if (length(x) != length(p))
	stop("'x' and 'p' must have the same number of elements")
	if(any(p < 0)) stop("probabilities must be non-negative.")
	if(abs(sum(p)-1) > sqrt(.Machine$double.eps)) {
	if(rescale.p) p <- p/sum(p)
	else stop("probabilities must sum to 1.")
	}
	METHOD <- "Chi-squared test for given probabilities"
	E <- n * p
	V <- n * p * (1 - p)
	STATISTIC <- sum((x - E) ^ 2 / E)
	names(E) <- names(x)
	if(simulate.p.value) {
	setMETH()
	nx <- length(x)
	sm <- matrix(sample.int(nx, B*n, TRUE, prob = p),nrow = n)
	ss <- apply(sm, 2L, function(x,E,k) {
	sum((table(factor(x, levels=1L:k)) - E)^2 / E)
	}, E = E, k = nx)
	PARAMETER <- NA
	PVAL <- (1 + sum(ss >= almost.1 * STATISTIC))/(B + 1)
	} else {
	PARAMETER <- length(x) - 1
	PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
	}
	}

	names(STATISTIC) <- "X-squared"
	names(PARAMETER) <- "df"
	if (any(E < 5) && is.finite(PARAMETER))
	warning("Chi-squared approximation may be incorrect")

	structure(list(statistic = STATISTIC,
	parameter = PARAMETER,
	p.value = PVAL,
	method = METHOD,
	data.name = DNAME,
	observed = x,
	expected = E,
	residuals = (x - E) / sqrt(E),
	stdres = (x - E) / sqrt(V) ),
	class = "htest")
	}