src/library/stats/R/ks.test.R - R - Git at Google

 #  File src/library/stats/R/ks.test.R
 #  Part of the R package, https://www.R-project.org
 #
 #  Copyright (C) 1995-2018 The R Core Team
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  A copy of the GNU General Public License is available at
 #  https://www.R-project.org/Licenses/

 ks.test <-
     function(x, y, ..., alternative = c("two.sided", "less", "greater"),
              exact = NULL)
 {
     alternative <- match.arg(alternative)
     DNAME <- deparse(substitute(x))
     x <- x[!is.na(x)]
     n <- length(x)
     if(n < 1L)
         stop("not enough 'x' data")
     PVAL <- NULL

     if(is.numeric(y)) { ## two-sample case
         DNAME <- paste(DNAME, "and", deparse(substitute(y)))
         y <- y[!is.na(y)]
         n.x <- as.double(n)             # to avoid integer overflow
         n.y <- length(y)
         if(n.y < 1L)
             stop("not enough 'y' data")
         if(is.null(exact))
             exact <- (n.x * n.y < 10000)
         METHOD <- "Two-sample Kolmogorov-Smirnov test"
         TIES <- FALSE
         n <- n.x * n.y / (n.x + n.y)
         w <- c(x, y)
         z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y))
         if(length(unique(w)) < (n.x + n.y)) {
             if (exact) {
                 warning("cannot compute exact p-value with ties")
                 exact <- FALSE
             } else
                 warning("p-value will be approximate in the presence of ties")
             z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
             TIES <- TRUE
         }
         STATISTIC <- switch(alternative,
                             "two.sided" = max(abs(z)),
                             "greater" = max(z),
                             "less" = - min(z))
         nm_alternative <- switch(alternative,
                                  "two.sided" = "two-sided",
                                  "less" = "the CDF of x lies below that of y",
                                  "greater" = "the CDF of x lies above that of y")
         if(exact && (alternative == "two.sided") && !TIES)
             PVAL <- 1 - .Call(C_pSmirnov2x, STATISTIC, n.x, n.y)
     } else { ## one-sample case
         if(is.character(y)) # avoid matching anything in this function
             y <- get(y, mode = "function", envir = parent.frame())
         if(!is.function(y))
             stop("'y' must be numeric or a function or a string naming a valid function")
         METHOD <- "One-sample Kolmogorov-Smirnov test"
         TIES <- FALSE
         if(length(unique(x)) < n) {
             warning("ties should not be present for the Kolmogorov-Smirnov test")
             TIES <- TRUE
         }
         if(is.null(exact)) exact <- (n < 100) && !TIES
         x <- y(sort(x), ...) - (0 : (n-1)) / n
         STATISTIC <- switch(alternative,
                             "two.sided" = max(c(x, 1/n - x)),
                             "greater" = max(1/n - x),
                             "less" = max(x))
         if(exact) {
             PVAL <- 1 - if(alternative == "two.sided")
                 .Call(C_pKolmogorov2x, STATISTIC, n)
             else {
                 pkolmogorov1x <- function(x, n) {
                     ## Probability function for the one-sided
                     ## one-sample Kolmogorov statistics, based on the
                     ## formula of Birnbaum & Tingey (1951).
                     if(x <= 0) return(0)
                     if(x >= 1) return(1)
                     j <- seq.int(from = 0, to = floor(n * (1 - x)))
                     1 - x * sum(exp(lchoose(n, j)
                                     + (n - j) * log(1 - x - j / n)
                                     + (j - 1) * log(x + j / n)))
                 }
                 pkolmogorov1x(STATISTIC, n)
             }
         }
         nm_alternative <-
             switch(alternative,
                    "two.sided" = "two-sided",
                    "less" = "the CDF of x lies below the null hypothesis",
                    "greater" = "the CDF of x lies above the null hypothesis")
     }

     names(STATISTIC) <- switch(alternative,
                                "two.sided" = "D",
                                "greater" = "D^+",
                                "less" = "D^-")

     if(is.null(PVAL)) { ## so not exact
         pkstwo <- function(x, tol = 1e-6) {
             ## Compute \sum_{-\infty}^\infty (-1)^k e^{-2k^2x^2}
             ## Not really needed at this generality for computing a single
             ## asymptotic p-value as below.
             if(is.numeric(x)) x <- as.double(x)
             else stop("argument 'x' must be numeric")
             p <- rep(0, length(x))
             p[is.na(x)] <- NA
             IND <- which(!is.na(x) & (x > 0))
             if(length(IND)) p[IND] <- .Call(C_pKS2, p = x[IND], tol)
             p
         }
         ## <FIXME>
         ## Currently, p-values for the two-sided two-sample case are
         ## exact if n.x * n.y < 10000 (unless controlled explicitly).
         ## In all other cases, the asymptotic distribution is used
         ## directly.  But: let m and n be the min and max of the sample
         ## sizes, respectively.  Then, according to Kim and Jennrich
         ## (1973), if m < n/10, we should use the
         ## * Kolmogorov approximation with c.c. -1/(2*n) if 1 < m < 80;
         ## * Smirnov approximation with c.c. 1/(2*sqrt(n)) if m >= 80.
         PVAL <- if(alternative == "two.sided")
                     1 - pkstwo(sqrt(n) * STATISTIC)
                 else exp(- 2 * n * STATISTIC^2)
         ## </FIXME>
     }

     ## fix up possible overshoot (PR#14671)
     PVAL <- min(1.0, max(0.0, PVAL))
     RVAL <- list(statistic = STATISTIC,
                  p.value = PVAL,
                  alternative = nm_alternative,
                  method = METHOD,
                  data.name = DNAME)
     class(RVAL) <- "htest"
     return(RVAL)
 }
	# File src/library/stats/R/ks.test.R
	# Part of the R package, https://www.R-project.org
	#
	# Copyright (C) 1995-2018 The R Core Team
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# A copy of the GNU General Public License is available at
	# https://www.R-project.org/Licenses/

	ks.test <-
	function(x, y, ..., alternative = c("two.sided", "less", "greater"),
	exact = NULL)
	{
	alternative <- match.arg(alternative)
	DNAME <- deparse(substitute(x))
	x <- x[!is.na(x)]
	n <- length(x)
	if(n < 1L)
	stop("not enough 'x' data")
	PVAL <- NULL

	if(is.numeric(y)) { ## two-sample case
	DNAME <- paste(DNAME, "and", deparse(substitute(y)))
	y <- y[!is.na(y)]
	n.x <- as.double(n) # to avoid integer overflow
	n.y <- length(y)
	if(n.y < 1L)
	stop("not enough 'y' data")
	if(is.null(exact))
	exact <- (n.x * n.y < 10000)
	METHOD <- "Two-sample Kolmogorov-Smirnov test"
	TIES <- FALSE
	n <- n.x * n.y / (n.x + n.y)
	w <- c(x, y)
	z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y))
	if(length(unique(w)) < (n.x + n.y)) {
	if (exact) {
	warning("cannot compute exact p-value with ties")
	exact <- FALSE
	} else
	warning("p-value will be approximate in the presence of ties")
	z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
	TIES <- TRUE
	}
	STATISTIC <- switch(alternative,
	"two.sided" = max(abs(z)),
	"greater" = max(z),
	"less" = - min(z))
	nm_alternative <- switch(alternative,
	"two.sided" = "two-sided",
	"less" = "the CDF of x lies below that of y",
	"greater" = "the CDF of x lies above that of y")
	if(exact && (alternative == "two.sided") && !TIES)
	PVAL <- 1 - .Call(C_pSmirnov2x, STATISTIC, n.x, n.y)
	} else { ## one-sample case
	if(is.character(y)) # avoid matching anything in this function
	y <- get(y, mode = "function", envir = parent.frame())
	if(!is.function(y))
	stop("'y' must be numeric or a function or a string naming a valid function")
	METHOD <- "One-sample Kolmogorov-Smirnov test"
	TIES <- FALSE
	if(length(unique(x)) < n) {
	warning("ties should not be present for the Kolmogorov-Smirnov test")
	TIES <- TRUE
	}
	if(is.null(exact)) exact <- (n < 100) && !TIES
	x <- y(sort(x), ...) - (0 : (n-1)) / n
	STATISTIC <- switch(alternative,
	"two.sided" = max(c(x, 1/n - x)),
	"greater" = max(1/n - x),
	"less" = max(x))
	if(exact) {
	PVAL <- 1 - if(alternative == "two.sided")
	.Call(C_pKolmogorov2x, STATISTIC, n)
	else {
	pkolmogorov1x <- function(x, n) {
	## Probability function for the one-sided
	## one-sample Kolmogorov statistics, based on the
	## formula of Birnbaum & Tingey (1951).
	if(x <= 0) return(0)
	if(x >= 1) return(1)
	j <- seq.int(from = 0, to = floor(n * (1 - x)))
	1 - x * sum(exp(lchoose(n, j)
	+ (n - j) * log(1 - x - j / n)
	+ (j - 1) * log(x + j / n)))
	}
	pkolmogorov1x(STATISTIC, n)
	}
	}
	nm_alternative <-
	switch(alternative,
	"two.sided" = "two-sided",
	"less" = "the CDF of x lies below the null hypothesis",
	"greater" = "the CDF of x lies above the null hypothesis")
	}

	names(STATISTIC) <- switch(alternative,
	"two.sided" = "D",
	"greater" = "D^+",
	"less" = "D^-")

	if(is.null(PVAL)) { ## so not exact
	pkstwo <- function(x, tol = 1e-6) {
	## Compute \sum_{-\infty}^\infty (-1)^k e^{-2k^2x^2}
	## Not really needed at this generality for computing a single
	## asymptotic p-value as below.
	if(is.numeric(x)) x <- as.double(x)
	else stop("argument 'x' must be numeric")
	p <- rep(0, length(x))
	p[is.na(x)] <- NA
	IND <- which(!is.na(x) & (x > 0))
	if(length(IND)) p[IND] <- .Call(C_pKS2, p = x[IND], tol)
	p
	}
	## <FIXME>
	## Currently, p-values for the two-sided two-sample case are
	## exact if n.x * n.y < 10000 (unless controlled explicitly).
	## In all other cases, the asymptotic distribution is used
	## directly. But: let m and n be the min and max of the sample
	## sizes, respectively. Then, according to Kim and Jennrich
	## (1973), if m < n/10, we should use the
	## * Kolmogorov approximation with c.c. -1/(2*n) if 1 < m < 80;
	## * Smirnov approximation with c.c. 1/(2*sqrt(n)) if m >= 80.
	PVAL <- if(alternative == "two.sided")
	1 - pkstwo(sqrt(n) * STATISTIC)
	else exp(- 2 * n * STATISTIC^2)
	## </FIXME>
	}

	## fix up possible overshoot (PR#14671)
	PVAL <- min(1.0, max(0.0, PVAL))
	RVAL <- list(statistic = STATISTIC,
	p.value = PVAL,
	alternative = nm_alternative,
	method = METHOD,
	data.name = DNAME)
	class(RVAL) <- "htest"
	return(RVAL)
	}