src/library/base/R/merge.R - R - Git at Google

 #  File src/library/base/R/merge.R
 #  Part of the R package, https://www.R-project.org
 #
 #  Copyright (C) 1995-2018 The R Core Team
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  A copy of the GNU General Public License is available at
 #  https://www.R-project.org/Licenses/

 merge <- function(x, y, ...) UseMethod("merge")

 merge.default <- function(x, y, ...)
     merge(as.data.frame(x), as.data.frame(y), ...)

 merge.data.frame <-
     function(x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by,
              all = FALSE, all.x = all, all.y = all,
              sort = TRUE, suffixes = c(".x",".y"), no.dups = TRUE,
              incomparables = NULL, ...)
 {
     fix.by <- function(by, df)
     {
         ## fix up 'by' to be a valid set of cols by number: 0 is row.names
         if(is.null(by)) by <- numeric()
         by <- as.vector(by)
         nc <- ncol(df)
         if(is.character(by)) {
             poss <- c("row.names", names(df))
             # names(df) are not necessarily unique, so check for multiple matches.
             if(any(bad <- !charmatch(by, poss, 0L)))
                 stop(ngettext(sum(bad),
                               "'by' must specify a uniquely valid column",
                               "'by' must specify uniquely valid columns"),
                      domain = NA)
             by <- match(by, poss) - 1L
         } else if(is.numeric(by)) {
             if(any(by < 0L) || any(by > nc))
                 stop("'by' must match numbers of columns")
         } else if(is.logical(by)) {
             if(length(by) != nc) stop("'by' must match number of columns")
             by <- seq_along(by)[by]
         } else stop("'by' must specify one or more columns as numbers, names or logical")
         if(any(bad <- is.na(by)))
             stop(ngettext(sum(bad),
                           "'by' must specify a uniquely valid column",
                           "'by' must specify uniquely valid columns"),
                  domain = NA)
          unique(by)
     }

     nx <- nrow(x <- as.data.frame(x)); ny <- nrow(y <- as.data.frame(y))
     if (nx >= 2^31 || ny >= 2^31) stop("long vectors are not supported")
     by.x <- fix.by(by.x, x)
     by.y <- fix.by(by.y, y)
     if((l.b <- length(by.x)) != length(by.y))
         stop("'by.x' and 'by.y' specify different numbers of columns")
     if(l.b == 0L) {
         ## return the cartesian product of x and y, fixing up common names
         nm <- nm.x <- names(x)
         nm.y <- names(y)
         has.common.nms <- any(cnm <- nm.x %in% nm.y)
         if(has.common.nms) {
             names(x)[cnm] <- paste0(nm.x[cnm], suffixes[1L])
             cnm <- nm.y %in% nm
             names(y)[cnm] <- paste0(nm.y[cnm], suffixes[2L])
         }
         if (nx == 0L || ny == 0L) {
             res <- cbind(x[FALSE, ], y[FALSE, ])
         } else {
             ij <- expand.grid(seq_len(nx), seq_len(ny))
             res <- cbind(x[ij[, 1L], , drop = FALSE],
                          y[ij[, 2L], , drop = FALSE])
         }
     }
     else {
         if(any(by.x == 0L)) {
             x <- cbind(Row.names = I(row.names(x)), x)
             by.x <- by.x + 1L
         }
         if(any(by.y == 0L)) {
             y <- cbind(Row.names = I(row.names(y)), y)
             by.y <- by.y + 1L
         }
         row.names(x) <- NULL
         row.names(y) <- NULL
         ## create keys from 'by' columns:
         if(l.b == 1L) {                  # (be faster)
             bx <- x[, by.x]; if(is.factor(bx)) bx <- as.character(bx)
             by <- y[, by.y]; if(is.factor(by)) by <- as.character(by)
         } else {
             if (!is.null(incomparables))
                 stop("'incomparables' is supported only for merging on a single column")
             ## Do these together for consistency in as.character.
             ## Use same set of names.
             bx <- x[, by.x, drop=FALSE]; by <- y[, by.y, drop=FALSE]
             names(bx) <- names(by) <- paste0("V", seq_len(ncol(bx)))
             bz <- do.call("paste", c(rbind(bx, by), sep = "\r"))
             bx <- bz[seq_len(nx)]
             by <- bz[nx + seq_len(ny)]
         }
         comm <- match(bx, by, 0L)
         bxy <- bx[comm > 0L]             # the keys which are in both
         xinds <- match(bx, bxy, 0L, incomparables)
         yinds <- match(by, bxy, 0L, incomparables)
         if(nx > 0L && ny > 0L)
             m <- .Internal(merge(xinds, yinds, all.x, all.y))
         else
             m <- list(xi = integer(), yi = integer(),
                       x.alone = seq_len(nx), y.alone = seq_len(ny))
         nm <- nm.x <- names(x)[-by.x]
         nm.by <- names(x)[by.x]
         nm.y <- names(y)[-by.y]
         ncx <- ncol(x)
         if(all.x) all.x <- (nxx <- length(m$x.alone)) > 0L
         if(all.y) all.y <- (nyy <- length(m$y.alone)) > 0L
         lxy <- length(m$xi)             # == length(m$yi)
         ## x = [ by | x ] :
         has.common.nms <- any(cnm <- nm.x %in% nm.y)
         if(has.common.nms && nzchar(suffixes[1L]))
             nm.x[cnm] <- paste0(nm.x[cnm], suffixes[1L])
         x <- x[c(m$xi, if(all.x) m$x.alone),
                c(by.x, seq_len(ncx)[-by.x]), drop=FALSE]
         names(x) <- c(nm.by, nm.x)
         if(all.y) { ## add the 'y.alone' rows to x[]
             ## need to have factor levels extended as well -> using [cr]bind
             ya <- y[m$y.alone, by.y, drop = FALSE]
             names(ya) <- nm.by
             ## this used to use a logical matrix, but that was not good
             ## enough as x could be zero-row.
             ## workaround possibly duplicated names: PR#15618
             xa <- x[rep.int(NA_integer_, nyy), nm.x, drop=FALSE ]
             names(xa) <- nm.x
             x <- rbind(x, cbind(ya, xa))
         }
         ## y (w/o 'by'):
         if(has.common.nms && nzchar(suffixes[2L])) {
             cnm <- nm.y %in% nm
             nm.y[cnm] <- paste0(nm.y[cnm], suffixes[2L])
         }
         y <- y[c(m$yi, if(all.x) rep.int(1L, nxx), if(all.y) m$y.alone),
                -by.y, drop = FALSE]
         if(all.x) {
             zap <- (lxy+1L):(lxy+nxx)
             for(i in seq_along(y)) {
                 ## do it this way to invoke methods for e.g. factor
                 if(is.matrix(y[[1]])) y[[1]][zap, ] <- NA
                 else is.na(y[[i]]) <- zap
             }
         }

         if(has.common.nms) names(y) <- nm.y
         ## If by.x %in% names(y) , duplicate column names may still arise.
         ## Apply suffixes to just y - keeping back compatibility when referring to by.x:
         if(no.dups && any((mi <- match(nm.by, names(y), 0L)) > 0L) && nzchar(suffixes[2L]))
             names(y)[mi] <- paste0(names(y)[mi], suffixes[2L])
         ## if(!no.dups) ## <<- speedup if we were sure we'd caught all dups above
         nm <- c(names(x), names(y))
         if(any(d <- duplicated(nm)))
             if(sum(d) > 1L)
                 warning("column names ",
                         paste(sQuote(nm[d]), collapse = ", "),
                         " are duplicated in the result", domain = NA)
             else
                 warning("column name ", sQuote(nm[d]),
                         " is duplicated in the result", domain = NA)
         res <- cbind(x, y)

         if (sort)
 	    res <- res[if(all.x || all.y) {
 			   x <- x[, seq_len(l.b), drop = FALSE]
 			   attributes(x) <- NULL
 			   do.call("order", x)
 		       } else sort.list(bx[m$xi]),, drop = FALSE]
     }
     attr(res, "row.names") <- .set_row_names(nrow(res))
     res
 }
	# File src/library/base/R/merge.R
	# Part of the R package, https://www.R-project.org
	#
	# Copyright (C) 1995-2018 The R Core Team
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# A copy of the GNU General Public License is available at
	# https://www.R-project.org/Licenses/

	merge <- function(x, y, ...) UseMethod("merge")

	merge.default <- function(x, y, ...)
	merge(as.data.frame(x), as.data.frame(y), ...)

	merge.data.frame <-
	function(x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by,
	all = FALSE, all.x = all, all.y = all,
	sort = TRUE, suffixes = c(".x",".y"), no.dups = TRUE,
	incomparables = NULL, ...)
	{
	fix.by <- function(by, df)
	{
	## fix up 'by' to be a valid set of cols by number: 0 is row.names
	if(is.null(by)) by <- numeric()
	by <- as.vector(by)
	nc <- ncol(df)
	if(is.character(by)) {
	poss <- c("row.names", names(df))
	# names(df) are not necessarily unique, so check for multiple matches.
	if(any(bad <- !charmatch(by, poss, 0L)))
	stop(ngettext(sum(bad),
	"'by' must specify a uniquely valid column",
	"'by' must specify uniquely valid columns"),
	domain = NA)
	by <- match(by, poss) - 1L
	} else if(is.numeric(by)) {
	if(any(by < 0L) \|\| any(by > nc))
	stop("'by' must match numbers of columns")
	} else if(is.logical(by)) {
	if(length(by) != nc) stop("'by' must match number of columns")
	by <- seq_along(by)[by]
	} else stop("'by' must specify one or more columns as numbers, names or logical")
	if(any(bad <- is.na(by)))
	stop(ngettext(sum(bad),
	"'by' must specify a uniquely valid column",
	"'by' must specify uniquely valid columns"),
	domain = NA)
	unique(by)
	}

	nx <- nrow(x <- as.data.frame(x)); ny <- nrow(y <- as.data.frame(y))
	if (nx >= 2^31 \|\| ny >= 2^31) stop("long vectors are not supported")
	by.x <- fix.by(by.x, x)
	by.y <- fix.by(by.y, y)
	if((l.b <- length(by.x)) != length(by.y))
	stop("'by.x' and 'by.y' specify different numbers of columns")
	if(l.b == 0L) {
	## return the cartesian product of x and y, fixing up common names
	nm <- nm.x <- names(x)
	nm.y <- names(y)
	has.common.nms <- any(cnm <- nm.x %in% nm.y)
	if(has.common.nms) {
	names(x)[cnm] <- paste0(nm.x[cnm], suffixes[1L])
	cnm <- nm.y %in% nm
	names(y)[cnm] <- paste0(nm.y[cnm], suffixes[2L])
	}
	if (nx == 0L \|\| ny == 0L) {
	res <- cbind(x[FALSE, ], y[FALSE, ])
	} else {
	ij <- expand.grid(seq_len(nx), seq_len(ny))
	res <- cbind(x[ij[, 1L], , drop = FALSE],
	y[ij[, 2L], , drop = FALSE])
	}
	}
	else {
	if(any(by.x == 0L)) {
	x <- cbind(Row.names = I(row.names(x)), x)
	by.x <- by.x + 1L
	}
	if(any(by.y == 0L)) {
	y <- cbind(Row.names = I(row.names(y)), y)
	by.y <- by.y + 1L
	}
	row.names(x) <- NULL
	row.names(y) <- NULL
	## create keys from 'by' columns:
	if(l.b == 1L) { # (be faster)
	bx <- x[, by.x]; if(is.factor(bx)) bx <- as.character(bx)
	by <- y[, by.y]; if(is.factor(by)) by <- as.character(by)
	} else {
	if (!is.null(incomparables))
	stop("'incomparables' is supported only for merging on a single column")
	## Do these together for consistency in as.character.
	## Use same set of names.
	bx <- x[, by.x, drop=FALSE]; by <- y[, by.y, drop=FALSE]
	names(bx) <- names(by) <- paste0("V", seq_len(ncol(bx)))
	bz <- do.call("paste", c(rbind(bx, by), sep = "\r"))
	bx <- bz[seq_len(nx)]
	by <- bz[nx + seq_len(ny)]
	}
	comm <- match(bx, by, 0L)
	bxy <- bx[comm > 0L] # the keys which are in both
	xinds <- match(bx, bxy, 0L, incomparables)
	yinds <- match(by, bxy, 0L, incomparables)
	if(nx > 0L && ny > 0L)
	m <- .Internal(merge(xinds, yinds, all.x, all.y))
	else
	m <- list(xi = integer(), yi = integer(),
	x.alone = seq_len(nx), y.alone = seq_len(ny))
	nm <- nm.x <- names(x)[-by.x]
	nm.by <- names(x)[by.x]
	nm.y <- names(y)[-by.y]
	ncx <- ncol(x)
	if(all.x) all.x <- (nxx <- length(m$x.alone)) > 0L
	if(all.y) all.y <- (nyy <- length(m$y.alone)) > 0L
	lxy <- length(m$xi) # == length(m$yi)
	## x = [ by \| x ] :
	has.common.nms <- any(cnm <- nm.x %in% nm.y)
	if(has.common.nms && nzchar(suffixes[1L]))
	nm.x[cnm] <- paste0(nm.x[cnm], suffixes[1L])
	x <- x[c(m$xi, if(all.x) m$x.alone),
	c(by.x, seq_len(ncx)[-by.x]), drop=FALSE]
	names(x) <- c(nm.by, nm.x)
	if(all.y) { ## add the 'y.alone' rows to x[]
	## need to have factor levels extended as well -> using [cr]bind
	ya <- y[m$y.alone, by.y, drop = FALSE]
	names(ya) <- nm.by
	## this used to use a logical matrix, but that was not good
	## enough as x could be zero-row.
	## workaround possibly duplicated names: PR#15618
	xa <- x[rep.int(NA_integer_, nyy), nm.x, drop=FALSE ]
	names(xa) <- nm.x
	x <- rbind(x, cbind(ya, xa))
	}
	## y (w/o 'by'):
	if(has.common.nms && nzchar(suffixes[2L])) {
	cnm <- nm.y %in% nm
	nm.y[cnm] <- paste0(nm.y[cnm], suffixes[2L])
	}
	y <- y[c(m$yi, if(all.x) rep.int(1L, nxx), if(all.y) m$y.alone),
	-by.y, drop = FALSE]
	if(all.x) {
	zap <- (lxy+1L):(lxy+nxx)
	for(i in seq_along(y)) {
	## do it this way to invoke methods for e.g. factor
	if(is.matrix(y[[1]])) y[[1]][zap, ] <- NA
	else is.na(y[[i]]) <- zap
	}
	}

	if(has.common.nms) names(y) <- nm.y
	## If by.x %in% names(y) , duplicate column names may still arise.
	## Apply suffixes to just y - keeping back compatibility when referring to by.x:
	if(no.dups && any((mi <- match(nm.by, names(y), 0L)) > 0L) && nzchar(suffixes[2L]))
	names(y)[mi] <- paste0(names(y)[mi], suffixes[2L])
	## if(!no.dups) ## <<- speedup if we were sure we'd caught all dups above
	nm <- c(names(x), names(y))
	if(any(d <- duplicated(nm)))
	if(sum(d) > 1L)
	warning("column names ",
	paste(sQuote(nm[d]), collapse = ", "),
	" are duplicated in the result", domain = NA)
	else
	warning("column name ", sQuote(nm[d]),
	" is duplicated in the result", domain = NA)
	res <- cbind(x, y)

	if (sort)
	res <- res[if(all.x \|\| all.y) {
	x <- x[, seq_len(l.b), drop = FALSE]
	attributes(x) <- NULL
	do.call("order", x)
	} else sort.list(bx[m$xi]),, drop = FALSE]
	}
	attr(res, "row.names") <- .set_row_names(nrow(res))
	res
	}