blob: a36f15d5d15dd009ef8a592dab2f935d085e3762 [file] [log] [blame]
# File src/library/base/R/dcf.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 1995-2017 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
read.dcf <-
function(file, fields = NULL, all = FALSE, keep.white = NULL)
{
if(is.character(file)){
file <- gzfile(file)
on.exit(close(file))
}
if(!inherits(file, "connection"))
stop("'file' must be a character string or connection")
## For historical reasons, the default is not to accumulate repeated
## fields in a record (in fact picking the *last* field occurrence).
## Use the internal code for performance reasons, but note that we
## could of course as well use
## do.call("cbind",
## lapply(out,
## function(s)
## if(is.atomic(s)) s
## else mapply("[[", s, lengths(s))))
if(!all) return(.Internal(readDCF(file, fields, keep.white)))
.assemble_things_into_a_data_frame <- function(tags, vals, nums) {
tf <- factor(tags, levels = unique(tags))
cnts <- table(nums, tf)
out <- array(NA_character_, dim = dim(cnts),
dimnames = list(NULL, levels(tf)))
if(all(cnts <= 1L)) {
## No repeated tags ...
out[cbind(nums, tf)] <- vals
out <- as.data.frame(out, stringsAsFactors = FALSE)
}
else {
levs <- colSums(cnts > 1L) == 0L
if(any(levs)) {
inds <- tf %in% levels(tf)[levs]
out[cbind(nums[inds], tf[inds])] <- vals[inds]
}
out <- as.data.frame(out, stringsAsFactors = FALSE)
for(l in levels(tf)[!levs]) {
out[[l]] <- rep.int(list(NA_character_), nrow(cnts))
i <- tf == l
out[[l]][unique(nums[i])] <- split(vals[i], nums[i])
}
}
out
}
## This needs to be done in an 8-bit locale,
## both for the regexps and strtrim().
ctype <- Sys.getlocale("LC_CTYPE")
on.exit(Sys.setlocale("LC_CTYPE", ctype), add = TRUE)
Sys.setlocale("LC_CTYPE", "C")
lines <- readLines(file, skipNul = TRUE)
## Try to find out about invalid things: mostly, lines which do not
## start with blanks but have no ':' ...
ind <- grep("^[^[:blank:]][^:]*$", lines)
if(length(ind)) {
lines <- strtrim(lines[ind], 0.7 * getOption("width"))
stop(gettextf("Invalid DCF format.\nRegular lines must have a tag.\nOffending lines start with:\n%s",
paste0(" ", lines, collapse = "\n")),
domain = NA)
}
line_is_not_empty <- !grepl("^[[:space:]]*$", lines)
nums <- cumsum(diff(c(FALSE, line_is_not_empty) > 0L) > 0L)
## Remove the empty ones so that nums knows which record each line
## belongs to.
nums <- nums[line_is_not_empty]
lines <- lines[line_is_not_empty]
## Deal with escaped blank lines (used by Debian at least for the
## Description: values, see man 5 deb-control):
line_is_escaped_blank <- grepl("^[[:space:]]+\\.[[:space:]]*$", lines)
if(any(line_is_escaped_blank))
lines[line_is_escaped_blank] <- ""
line_has_tag <- grepl("^[^[:blank:]][^:]*:", lines)
## Check that records start with tag lines.
pos <- which(diff(nums) > 0L) + 1L
ind <- !line_has_tag[pos]
if(any(ind)) {
lines <- strtrim(lines[pos[ind]], 0.7 * getOption("width"))
stop(gettextf("Invalid DCF format.\nContinuation lines must not start a record.\nOffending lines start with:\n%s",
paste0(" ", lines, collapse = "\n")),
domain = NA)
}
lengths <- rle(cumsum(line_has_tag))$lengths
## End positions of field entries.
pos <- cumsum(lengths)
tags <- sub(":.*", "", lines[line_has_tag])
lines[line_has_tag] <-
sub("[^:]*:[[:space:]]*", "", lines[line_has_tag])
fold <- is.na(match(tags, keep.white))
foldable <- rep.int(fold, lengths)
lines[foldable] <- sub("^[[:space:]]*", "", lines[foldable])
lines[foldable] <- sub("[[:space:]]*$", "", lines[foldable])
vals <- mapply(function(from, to) paste(lines[from:to],
collapse = "\n"),
c(1L, pos[-length(pos)] + 1L), pos)
vals[fold] <- trimws(vals[fold])
out <- .assemble_things_into_a_data_frame(tags, vals, nums[pos])
if(!is.null(fields))
out <- out[fields]
out
}
write.dcf <-
function(x, file = "", append = FALSE, useBytes = FALSE,
indent = 0.1 * getOption("width"),
width = 0.9 * getOption("width"),
keep.white = NULL)
{
if(file == "")
file <- stdout()
else if(is.character(file)) {
file <- file(file, if(append) "a" else "w")
on.exit(close(file))
}
if(!inherits(file, "connection"))
stop("'file' must be a character string or connection")
## We need to take care of two things:
## * We really should not write out NA entries.
## * We have to handle multiple fields per record.
## do not assume that the input is valid in this locale
escape_paragraphs <- function(s)
gsub("\n \\.([^\n])","\n .\\1",
gsub("\n[ \t]*\n", "\n .\n ", s, perl = TRUE, useBytes = TRUE),
perl = TRUE, useBytes = TRUE)
fmt <- function(tag, val, fold = TRUE) {
s <- if(fold)
formatDL(rep.int(tag, length(val)), val, style = "list",
width = width, indent = indent)
else {
## Need to ensure a leading whitespace for continuation
## lines.
sprintf("%s: %s", tag,
gsub("\n([^[:blank:]])", "\n \\1", val))
}
escape_paragraphs(s)
}
if(!is.data.frame(x))
x <- as.data.frame(x, stringsAsFactors = FALSE)
nmx <- names(x)
out <- matrix("", nrow(x), ncol(x))
foldable <- is.na(match(nmx, keep.white))
for(j in seq_along(x)) {
xj <- x[[j]]
if(is.atomic(xj)) {
## For atomic ("character") columns, things are simple ...
i <- !is.na(xj)
out[i, j] <- fmt(nmx[j], xj[i], foldable[j])
}
else {
## Should be a list ...
nmxj <- nmx[j]
fold <- foldable[j]
i <- !vapply(xj, function(s) (length(s) == 1L) && is.na(s), NA)
out[i, j] <-
vapply(xj[i],
function(s) {
paste(fmt(nmxj, s, fold), collapse = "\n")
}, "")
}
}
out <- t(out)
is_not_empty <- nzchar(out)
eor <- character(sum(is_not_empty))
if(length(eor)) {
## Newline for end of record.
## Note that we do not write a trailing blank line.
eor[ which(diff(c(col(out))[is_not_empty]) >= 1L) ] <- "\n"
}
writeLines(paste0(c(out[is_not_empty]), eor), file, useBytes=useBytes)
}