src/library/stats/src/distance.c - R - Git at Google

 /*
  *  R : A Computer Language for Statistical Data Analysis
  *  Copyright (C) 1998-2017  The R Core Team
  *  Copyright (C) 2002-2017  The R Foundation
  *  Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, a copy is available at
  *  https://www.R-project.org/Licenses/
  */

 #ifdef HAVE_CONFIG_H
 # include <config.h>
 #endif

 #include <float.h>

 #include <R.h>
 #include <Rmath.h>
 #include "stats.h"
 #ifdef _OPENMP
 # include <R_ext/MathThreads.h>
 #endif

 #define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b))
 #ifdef R_160_and_older
 #define both_non_NA both_FINITE
 #else
 #define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b))
 #endif

 static double R_euclidean(double *x, int nr, int nc, int i1, int i2)
 {
     double dev, dist;
     int count, j;

     count= 0;
     dist = 0;
     for(j = 0 ; j < nc ; j++) {
 	if(both_non_NA(x[i1], x[i2])) {
 	    dev = (x[i1] - x[i2]);
 	    if(!ISNAN(dev)) {
 		dist += dev * dev;
 		count++;
 	    }
 	}
 	i1 += nr;
 	i2 += nr;
     }
     if(count == 0) return NA_REAL;
     if(count != nc) dist /= ((double)count/nc);
     return sqrt(dist);
 }

 static double R_maximum(double *x, int nr, int nc, int i1, int i2)
 {
     double dev, dist;
     int count, j;

     count = 0;
     dist = -DBL_MAX;
     for(j = 0 ; j < nc ; j++) {
 	if(both_non_NA(x[i1], x[i2])) {
 	    dev = fabs(x[i1] - x[i2]);
 	    if(!ISNAN(dev)) {
 		if(dev > dist)
 		    dist = dev;
 		count++;
 	    }
 	}
 	i1 += nr;
 	i2 += nr;
     }
     if(count == 0) return NA_REAL;
     return dist;
 }

 static double R_manhattan(double *x, int nr, int nc, int i1, int i2)
 {
     double dev, dist;
     int count, j;

     count = 0;
     dist = 0;
     for(j = 0 ; j < nc ; j++) {
 	if(both_non_NA(x[i1], x[i2])) {
 	    dev = fabs(x[i1] - x[i2]);
 	    if(!ISNAN(dev)) {
 		dist += dev;
 		count++;
 	    }
 	}
 	i1 += nr;
 	i2 += nr;
     }
     if(count == 0) return NA_REAL;
     if(count != nc) dist /= ((double)count/nc);
     return dist;
 }

 static double R_canberra(double *x, int nr, int nc, int i1, int i2)
 {
     double dev, dist, sum, diff;
     int count, j;

     count = 0;
     dist = 0;
     for(j = 0 ; j < nc ; j++) {
 	if(both_non_NA(x[i1], x[i2])) {
 	    sum = fabs(x[i1]) + fabs(x[i2]);
 	    diff = fabs(x[i1] - x[i2]);
 	    if (sum > DBL_MIN || diff > DBL_MIN) {
 		dev = diff/sum;
 		if(!ISNAN(dev) ||
 		   (!R_FINITE(diff) && diff == sum &&
 		    /* use Inf = lim x -> oo */ (dev = 1., TRUE))) {
 		    dist += dev;
 		    count++;
 		}
 	    }
 	}
 	i1 += nr;
 	i2 += nr;
     }
     if(count == 0) return NA_REAL;
     if(count != nc) dist /= ((double)count/nc);
     return dist;
 }

 static double R_dist_binary(double *x, int nr, int nc, int i1, int i2)
 {
     int total, count, dist;
     int j;

     total = 0;
     count = 0;
     dist = 0;

     for(j = 0 ; j < nc ; j++) {
 	if(both_non_NA(x[i1], x[i2])) {
 	    if(!both_FINITE(x[i1], x[i2])) {
 		warning(_("treating non-finite values as NA"));
 	    }
 	    else {
 		if(x[i1] != 0. || x[i2] != 0.) {
 		    count++;
 		    if( ! (x[i1] != 0. && x[i2] != 0.) ) dist++;
 		}
 		total++;
 	    }
 	}
 	i1 += nr;
 	i2 += nr;
     }

     if(total == 0) return NA_REAL;
     if(count == 0) return 0;
     return (double) dist / count;
 }

 static double R_minkowski(double *x, int nr, int nc, int i1, int i2, double p)
 {
     double dev, dist;
     int count, j;

     count= 0;
     dist = 0;
     for(j = 0 ; j < nc ; j++) {
 	if(both_non_NA(x[i1], x[i2])) {
 	    dev = (x[i1] - x[i2]);
 	    if(!ISNAN(dev)) {
 		dist += R_pow(fabs(dev), p);
 		count++;
 	    }
 	}
 	i1 += nr;
 	i2 += nr;
     }
     if(count == 0) return NA_REAL;
     if(count != nc) dist /= ((double)count/nc);
     return R_pow(dist, 1.0/p);
 }

 enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY, MINKOWSKI };
 /* == 1,2,..., defined by order in the R function dist */

 void R_distance(double *x, int *nr, int *nc, double *d, int *diag,
 		int *method, double *p)
 {
     int dc, i, j;
     size_t  ij;  /* can exceed 2^31 - 1 */
     double (*distfun)(double*, int, int, int, int) = NULL;
 #ifdef _OPENMP
     int nthreads;
 #endif

     switch(*method) {
     case EUCLIDEAN:
 	distfun = R_euclidean;
 	break;
     case MAXIMUM:
 	distfun = R_maximum;
 	break;
     case MANHATTAN:
 	distfun = R_manhattan;
 	break;
     case CANBERRA:
 	distfun = R_canberra;
 	break;
     case BINARY:
 	distfun = R_dist_binary;
 	break;
     case MINKOWSKI:
 	if(!R_FINITE(*p) || *p <= 0)
 	    error(_("distance(): invalid p"));
 	// plus special case below because of extra argument
 	break;
     default:
 	error(_("distance(): invalid distance"));
     }
     dc = (*diag) ? 0 : 1; /* diag=1:  we do the diagonal */
 #ifdef _OPENMP
     if (R_num_math_threads > 0)
 	nthreads = R_num_math_threads;
     else
 	nthreads = 1; /* for now */
     if (nthreads == 1) {
 	/* do the nthreads == 1 case without any OMP overhead to see
 	   if it matters on some platforms */
 	ij = 0;
 	for(j = 0 ; j <= *nr ; j++)
 	    for(i = j+dc ; i < *nr ; i++)
 		d[ij++] = (*method != MINKOWSKI) ?
 		    distfun(x, *nr, *nc, i, j) :
 		    R_minkowski(x, *nr, *nc, i, j, *p);
     }
     else
 	/* This produces uneven thread workloads since the outer loop
 	   is over the subdiagonal portions of columns.  An
 	   alternative would be to use a loop on ij and to compute the
 	   i and j values from ij. */
 #pragma omp parallel for num_threads(nthreads) default(none)	\
     private(i, j, ij)						\
     firstprivate(nr, dc, d, method, distfun, nc, x, p)
 	for(j = 0 ; j <= *nr ; j++) {
 	    ij = j * (*nr - dc) + j - ((1 + j) * j) / 2;
 	    for(i = j+dc ; i < *nr ; i++)
 		d[ij++] = (*method != MINKOWSKI) ?
 		    distfun(x, *nr, *nc, i, j) :
 		    R_minkowski(x, *nr, *nc, i, j, *p);
 	}
 #else
     ij = 0;
     for(j = 0 ; j <= *nr ; j++)
 	for(i = j+dc ; i < *nr ; i++)
 	    d[ij++] = (*method != MINKOWSKI) ?
 		distfun(x, *nr, *nc, i, j) : R_minkowski(x, *nr, *nc, i, j, *p);
 #endif
 }

 #include <Rinternals.h>

 SEXP Cdist(SEXP x, SEXP smethod, SEXP attrs, SEXP p)
 {
     SEXP ans;
     int nr = nrows(x), nc = ncols(x), method = asInteger(smethod);
     int diag = 0;
     R_xlen_t N;
     double rp = asReal(p);
     N = (R_xlen_t)nr * (nr-1)/2; /* avoid int overflow for N ~ 50,000 */
     PROTECT(ans = allocVector(REALSXP, N));
     if(TYPEOF(x) != REALSXP) x = coerceVector(x, REALSXP);
     PROTECT(x);
     R_distance(REAL(x), &nr, &nc, REAL(ans), &diag, &method, &rp);
     /* tack on attributes */
     SEXP names = getAttrib(attrs, R_NamesSymbol);
     for (int i = 0; i < LENGTH(attrs); i++)
 	setAttrib(ans, install(translateChar(STRING_ELT(names, i))),
 		  VECTOR_ELT(attrs, i));
     UNPROTECT(2);
     return ans;
 }
	/*
	* R : A Computer Language for Statistical Data Analysis
	* Copyright (C) 1998-2017 The R Core Team
	* Copyright (C) 2002-2017 The R Foundation
	* Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, a copy is available at
	* https://www.R-project.org/Licenses/
	*/

	#ifdef HAVE_CONFIG_H
	# include <config.h>
	#endif

	#include <float.h>

	#include <R.h>
	#include <Rmath.h>
	#include "stats.h"
	#ifdef _OPENMP
	# include <R_ext/MathThreads.h>
	#endif

	#define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b))
	#ifdef R_160_and_older
	#define both_non_NA both_FINITE
	#else
	#define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b))
	#endif

	static double R_euclidean(double *x, int nr, int nc, int i1, int i2)
	{
	double dev, dist;
	int count, j;

	count= 0;
	dist = 0;
	for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	dev = (x[i1] - x[i2]);
	if(!ISNAN(dev)) {
	dist += dev * dev;
	count++;
	}
	}
	i1 += nr;
	i2 += nr;
	}
	if(count == 0) return NA_REAL;
	if(count != nc) dist /= ((double)count/nc);
	return sqrt(dist);
	}

	static double R_maximum(double *x, int nr, int nc, int i1, int i2)
	{
	double dev, dist;
	int count, j;

	count = 0;
	dist = -DBL_MAX;
	for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	dev = fabs(x[i1] - x[i2]);
	if(!ISNAN(dev)) {
	if(dev > dist)
	dist = dev;
	count++;
	}
	}
	i1 += nr;
	i2 += nr;
	}
	if(count == 0) return NA_REAL;
	return dist;
	}

	static double R_manhattan(double *x, int nr, int nc, int i1, int i2)
	{
	double dev, dist;
	int count, j;

	count = 0;
	dist = 0;
	for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	dev = fabs(x[i1] - x[i2]);
	if(!ISNAN(dev)) {
	dist += dev;
	count++;
	}
	}
	i1 += nr;
	i2 += nr;
	}
	if(count == 0) return NA_REAL;
	if(count != nc) dist /= ((double)count/nc);
	return dist;
	}

	static double R_canberra(double *x, int nr, int nc, int i1, int i2)
	{
	double dev, dist, sum, diff;
	int count, j;

	count = 0;
	dist = 0;
	for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	sum = fabs(x[i1]) + fabs(x[i2]);
	diff = fabs(x[i1] - x[i2]);
	if (sum > DBL_MIN \|\| diff > DBL_MIN) {
	dev = diff/sum;
	if(!ISNAN(dev) \|\|
	(!R_FINITE(diff) && diff == sum &&
	/* use Inf = lim x -> oo */ (dev = 1., TRUE))) {
	dist += dev;
	count++;
	}
	}
	}
	i1 += nr;
	i2 += nr;
	}
	if(count == 0) return NA_REAL;
	if(count != nc) dist /= ((double)count/nc);
	return dist;
	}

	static double R_dist_binary(double *x, int nr, int nc, int i1, int i2)
	{
	int total, count, dist;
	int j;

	total = 0;
	count = 0;
	dist = 0;

	for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	if(!both_FINITE(x[i1], x[i2])) {
	warning(_("treating non-finite values as NA"));
	}
	else {
	if(x[i1] != 0. \|\| x[i2] != 0.) {
	count++;
	if( ! (x[i1] != 0. && x[i2] != 0.) ) dist++;
	}
	total++;
	}
	}
	i1 += nr;
	i2 += nr;
	}

	if(total == 0) return NA_REAL;
	if(count == 0) return 0;
	return (double) dist / count;
	}

	static double R_minkowski(double *x, int nr, int nc, int i1, int i2, double p)
	{
	double dev, dist;
	int count, j;

	count= 0;
	dist = 0;
	for(j = 0 ; j < nc ; j++) {
	if(both_non_NA(x[i1], x[i2])) {
	dev = (x[i1] - x[i2]);
	if(!ISNAN(dev)) {
	dist += R_pow(fabs(dev), p);
	count++;
	}
	}
	i1 += nr;
	i2 += nr;
	}
	if(count == 0) return NA_REAL;
	if(count != nc) dist /= ((double)count/nc);
	return R_pow(dist, 1.0/p);
	}

	enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY, MINKOWSKI };
	/* == 1,2,..., defined by order in the R function dist */

	void R_distance(double x, int nr, int nc, double d, int *diag,
	int method, double p)
	{
	int dc, i, j;
	size_t ij; /* can exceed 2^31 - 1 */
	double (distfun)(double, int, int, int, int) = NULL;
	#ifdef _OPENMP
	int nthreads;
	#endif

	switch(*method) {
	case EUCLIDEAN:
	distfun = R_euclidean;
	break;
	case MAXIMUM:
	distfun = R_maximum;
	break;
	case MANHATTAN:
	distfun = R_manhattan;
	break;
	case CANBERRA:
	distfun = R_canberra;
	break;
	case BINARY:
	distfun = R_dist_binary;
	break;
	case MINKOWSKI:
	if(!R_FINITE(p) \|\| p <= 0)
	error(_("distance(): invalid p"));
	// plus special case below because of extra argument
	break;
	default:
	error(_("distance(): invalid distance"));
	}
	dc = (diag) ? 0 : 1; / diag=1: we do the diagonal */
	#ifdef _OPENMP
	if (R_num_math_threads > 0)
	nthreads = R_num_math_threads;
	else
	nthreads = 1; /* for now */
	if (nthreads == 1) {
	/* do the nthreads == 1 case without any OMP overhead to see
	if it matters on some platforms */
	ij = 0;
	for(j = 0 ; j <= *nr ; j++)
	for(i = j+dc ; i < *nr ; i++)
	d[ij++] = (*method != MINKOWSKI) ?
	distfun(x, nr, nc, i, j) :
	R_minkowski(x, nr, nc, i, j, *p);
	}
	else
	/* This produces uneven thread workloads since the outer loop
	is over the subdiagonal portions of columns. An
	alternative would be to use a loop on ij and to compute the
	i and j values from ij. */
	#pragma omp parallel for num_threads(nthreads) default(none) \
	private(i, j, ij) \
	firstprivate(nr, dc, d, method, distfun, nc, x, p)
	for(j = 0 ; j <= *nr ; j++) {
	ij = j * (nr - dc) + j - ((1 + j) j) / 2;
	for(i = j+dc ; i < *nr ; i++)
	d[ij++] = (*method != MINKOWSKI) ?
	distfun(x, nr, nc, i, j) :
	R_minkowski(x, nr, nc, i, j, *p);
	}
	#else
	ij = 0;
	for(j = 0 ; j <= *nr ; j++)
	for(i = j+dc ; i < *nr ; i++)
	d[ij++] = (*method != MINKOWSKI) ?
	distfun(x, nr, nc, i, j) : R_minkowski(x, nr, nc, i, j, *p);
	#endif
	}

	#include <Rinternals.h>

	SEXP Cdist(SEXP x, SEXP smethod, SEXP attrs, SEXP p)
	{
	SEXP ans;
	int nr = nrows(x), nc = ncols(x), method = asInteger(smethod);
	int diag = 0;
	R_xlen_t N;
	double rp = asReal(p);
	N = (R_xlen_t)nr * (nr-1)/2; /* avoid int overflow for N ~ 50,000 */
	PROTECT(ans = allocVector(REALSXP, N));
	if(TYPEOF(x) != REALSXP) x = coerceVector(x, REALSXP);
	PROTECT(x);
	R_distance(REAL(x), &nr, &nc, REAL(ans), &diag, &method, &rp);
	/* tack on attributes */
	SEXP names = getAttrib(attrs, R_NamesSymbol);
	for (int i = 0; i < LENGTH(attrs); i++)
	setAttrib(ans, install(translateChar(STRING_ELT(names, i))),
	VECTOR_ELT(attrs, i));
	UNPROTECT(2);
	return ans;
	}