blob: 60b347e7e8e6610946f0bb09b82a6d2077aa3f7a [file] [log] [blame]
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 1998-2017 The R Core Team
* Copyright (C) 2002-2017 The R Foundation
* Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <float.h>
#include <R.h>
#include <Rmath.h>
#include "stats.h"
#ifdef _OPENMP
# include <R_ext/MathThreads.h>
#endif
#define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b))
#ifdef R_160_and_older
#define both_non_NA both_FINITE
#else
#define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b))
#endif
static double R_euclidean(double *x, int nr, int nc, int i1, int i2)
{
double dev, dist;
int count, j;
count= 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(both_non_NA(x[i1], x[i2])) {
dev = (x[i1] - x[i2]);
if(!ISNAN(dev)) {
dist += dev * dev;
count++;
}
}
i1 += nr;
i2 += nr;
}
if(count == 0) return NA_REAL;
if(count != nc) dist /= ((double)count/nc);
return sqrt(dist);
}
static double R_maximum(double *x, int nr, int nc, int i1, int i2)
{
double dev, dist;
int count, j;
count = 0;
dist = -DBL_MAX;
for(j = 0 ; j < nc ; j++) {
if(both_non_NA(x[i1], x[i2])) {
dev = fabs(x[i1] - x[i2]);
if(!ISNAN(dev)) {
if(dev > dist)
dist = dev;
count++;
}
}
i1 += nr;
i2 += nr;
}
if(count == 0) return NA_REAL;
return dist;
}
static double R_manhattan(double *x, int nr, int nc, int i1, int i2)
{
double dev, dist;
int count, j;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(both_non_NA(x[i1], x[i2])) {
dev = fabs(x[i1] - x[i2]);
if(!ISNAN(dev)) {
dist += dev;
count++;
}
}
i1 += nr;
i2 += nr;
}
if(count == 0) return NA_REAL;
if(count != nc) dist /= ((double)count/nc);
return dist;
}
static double R_canberra(double *x, int nr, int nc, int i1, int i2)
{
double dev, dist, sum, diff;
int count, j;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(both_non_NA(x[i1], x[i2])) {
sum = fabs(x[i1]) + fabs(x[i2]);
diff = fabs(x[i1] - x[i2]);
if (sum > DBL_MIN || diff > DBL_MIN) {
dev = diff/sum;
if(!ISNAN(dev) ||
(!R_FINITE(diff) && diff == sum &&
/* use Inf = lim x -> oo */ (dev = 1., TRUE))) {
dist += dev;
count++;
}
}
}
i1 += nr;
i2 += nr;
}
if(count == 0) return NA_REAL;
if(count != nc) dist /= ((double)count/nc);
return dist;
}
static double R_dist_binary(double *x, int nr, int nc, int i1, int i2)
{
int total, count, dist;
int j;
total = 0;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(both_non_NA(x[i1], x[i2])) {
if(!both_FINITE(x[i1], x[i2])) {
warning(_("treating non-finite values as NA"));
}
else {
if(x[i1] != 0. || x[i2] != 0.) {
count++;
if( ! (x[i1] != 0. && x[i2] != 0.) ) dist++;
}
total++;
}
}
i1 += nr;
i2 += nr;
}
if(total == 0) return NA_REAL;
if(count == 0) return 0;
return (double) dist / count;
}
static double R_minkowski(double *x, int nr, int nc, int i1, int i2, double p)
{
double dev, dist;
int count, j;
count= 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(both_non_NA(x[i1], x[i2])) {
dev = (x[i1] - x[i2]);
if(!ISNAN(dev)) {
dist += R_pow(fabs(dev), p);
count++;
}
}
i1 += nr;
i2 += nr;
}
if(count == 0) return NA_REAL;
if(count != nc) dist /= ((double)count/nc);
return R_pow(dist, 1.0/p);
}
enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY, MINKOWSKI };
/* == 1,2,..., defined by order in the R function dist */
void R_distance(double *x, int *nr, int *nc, double *d, int *diag,
int *method, double *p)
{
int dc, i, j;
size_t ij; /* can exceed 2^31 - 1 */
double (*distfun)(double*, int, int, int, int) = NULL;
#ifdef _OPENMP
int nthreads;
#endif
switch(*method) {
case EUCLIDEAN:
distfun = R_euclidean;
break;
case MAXIMUM:
distfun = R_maximum;
break;
case MANHATTAN:
distfun = R_manhattan;
break;
case CANBERRA:
distfun = R_canberra;
break;
case BINARY:
distfun = R_dist_binary;
break;
case MINKOWSKI:
if(!R_FINITE(*p) || *p <= 0)
error(_("distance(): invalid p"));
// plus special case below because of extra argument
break;
default:
error(_("distance(): invalid distance"));
}
dc = (*diag) ? 0 : 1; /* diag=1: we do the diagonal */
#ifdef _OPENMP
if (R_num_math_threads > 0)
nthreads = R_num_math_threads;
else
nthreads = 1; /* for now */
if (nthreads == 1) {
/* do the nthreads == 1 case without any OMP overhead to see
if it matters on some platforms */
ij = 0;
for(j = 0 ; j <= *nr ; j++)
for(i = j+dc ; i < *nr ; i++)
d[ij++] = (*method != MINKOWSKI) ?
distfun(x, *nr, *nc, i, j) :
R_minkowski(x, *nr, *nc, i, j, *p);
}
else
/* This produces uneven thread workloads since the outer loop
is over the subdiagonal portions of columns. An
alternative would be to use a loop on ij and to compute the
i and j values from ij. */
#pragma omp parallel for num_threads(nthreads) default(none) \
private(i, j, ij) \
firstprivate(nr, dc, d, method, distfun, nc, x, p)
for(j = 0 ; j <= *nr ; j++) {
ij = j * (*nr - dc) + j - ((1 + j) * j) / 2;
for(i = j+dc ; i < *nr ; i++)
d[ij++] = (*method != MINKOWSKI) ?
distfun(x, *nr, *nc, i, j) :
R_minkowski(x, *nr, *nc, i, j, *p);
}
#else
ij = 0;
for(j = 0 ; j <= *nr ; j++)
for(i = j+dc ; i < *nr ; i++)
d[ij++] = (*method != MINKOWSKI) ?
distfun(x, *nr, *nc, i, j) : R_minkowski(x, *nr, *nc, i, j, *p);
#endif
}
#include <Rinternals.h>
SEXP Cdist(SEXP x, SEXP smethod, SEXP attrs, SEXP p)
{
SEXP ans;
int nr = nrows(x), nc = ncols(x), method = asInteger(smethod);
int diag = 0;
R_xlen_t N;
double rp = asReal(p);
N = (R_xlen_t)nr * (nr-1)/2; /* avoid int overflow for N ~ 50,000 */
PROTECT(ans = allocVector(REALSXP, N));
if(TYPEOF(x) != REALSXP) x = coerceVector(x, REALSXP);
PROTECT(x);
R_distance(REAL(x), &nr, &nc, REAL(ans), &diag, &method, &rp);
/* tack on attributes */
SEXP names = getAttrib(attrs, R_NamesSymbol);
for (int i = 0; i < LENGTH(attrs); i++)
setAttrib(ans, install(translateChar(STRING_ELT(names, i))),
VECTOR_ELT(attrs, i));
UNPROTECT(2);
return ans;
}