# File src/library/tools/R/urltools.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 2015-2019 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
get_IANA_URI_scheme_db <-
function()
{
## See
## <http://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml>.
baseurl <- "http://www.iana.org/assignments/uri-schemes/"
db <- utils::read.csv(url(paste0(baseurl, "uri-schemes-1.csv")),
stringsAsFactors = FALSE, encoding = "UTF-8")
names(db) <- chartr(".", "_", names(db))
db
}
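## Illustrative usage (not run; needs network access, and assumes the
## IANA registry CSV keeps its current column layout):
##   db <- get_IANA_URI_scheme_db()
##   head(db$URI_Scheme)   # e.g. "aaa" "aaas" "about" ...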
parse_URI_reference <-
function(x)
{
## See RFC 3986 <http://www.ietf.org/rfc/rfc3986.txt>.
re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
if(length(x)) {
y <- do.call(rbind, regmatches(x, regexec(re, x)))
y <- y[, c(3, 5, 6, 8, 10), drop = FALSE]
} else {
y <- matrix(character(), 0L, 5L)
}
colnames(y) <- c("scheme", "authority", "path", "query", "fragment")
y
}
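## Illustrative example (not run), using the URI from RFC 3986 Appendix B:
##   parse_URI_reference("http://www.ics.uci.edu/pub/ietf/uri/#Related")
## yields a one-row matrix with
##   scheme = "http", authority = "www.ics.uci.edu",
##   path = "/pub/ietf/uri/", query = "", fragment = "Related".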
.get_urls_from_Rd <-
function(x)
{
urls <- character()
recurse <- function(e) {
tag <- attr(e, "Rd_tag")
## Rd2HTML and Rd2latex remove whitespace and \n from URLs.
if(identical(tag, "\\url")) {
urls <<-
c(urls, trimws(gsub("\n", "", .Rd_deparse(e, tag = FALSE),
fixed = TRUE, useBytes = TRUE)))
} else if(identical(tag, "\\href")) {
urls <<-
c(urls, trimws(gsub("\n", "",
.Rd_deparse(e[[1L]], tag = FALSE),
fixed = TRUE, useBytes = TRUE)))
} else if(is.list(e))
lapply(e, recurse)
}
lapply(x, recurse)
unique(trimws(urls))
}
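## Illustrative example (not run): for Rd content containing
##   \url{https://www.R-project.org} and
##   \href{https://bugs.R-project.org}{the bug tracker}
## this collects "https://www.R-project.org" and
## "https://bugs.R-project.org" (for \href only the URL part is kept).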
.get_urls_from_HTML_file <-
function(f)
{
doc <- xml2::read_html(f)
if(!inherits(doc, "xml_node")) return(character())
nodes <- xml2::xml_find_all(doc, "//a")
hrefs <- xml2::xml_attr(nodes, "href")
unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")])
}
.get_urls_from_PDF_file <-
function(f)
{
## There seems to be no straightforward way to extract hyperlinks from
## a PDF, hence first convert it to HTML.
## Note that pdftohtml always writes its output to the cwd ...
owd <- getwd()
dir.create(d <- tempfile())
on.exit({ unlink(d, recursive = TRUE); setwd(owd) })
file.copy(normalizePath(f), d)
setwd(d)
g <- tempfile(tmpdir = d, fileext = ".xml")
system2("pdftohtml",
c("-s -q -i -c -xml", basename(f), basename(g)))
## Oh dear: pdftohtml can apparently fail without returning a non-zero
## exit status, so check for the output file instead.
if(file.exists(g))
.get_urls_from_HTML_file(g)
else
character()
}
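## Illustrative example (not run; assumes the 'pdftohtml' utility,
## typically shipped with poppler-utils, is on the PATH):
##   .get_urls_from_PDF_file("vignette.pdf")
## effectively runs
##   pdftohtml -s -q -i -c -xml vignette.pdf <tempfile>.xml
## in a temporary directory and then scans the XML for href attributes.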
url_db <-
function(urls, parents)
{
## Some people get leading LFs in URLs, so trim before checking.
db <- data.frame(URL = trimws(as.character(urls)),
Parent = as.character(parents),
stringsAsFactors = FALSE)
class(db) <- c("url_db", "data.frame")
db
}
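## Illustrative example (not run), with placeholder values:
##   url_db(c(" https://www.R-project.org\n", "ftp://example.org/pub"),
##          c("man/foo.Rd", "DESCRIPTION"))
## gives a two-row 'url_db' data frame with the surrounding whitespace
## already trimmed from the URLs.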
url_db_from_HTML_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
urls <- parents <- character()
if(is.null(files))
files <- list.files(dir, pattern = "[.]html$",
full.names = TRUE,
recursive = recursive)
urls <-
lapply(files,
function(f) {
if(verbose)
message(sprintf("processing %s",
.file_path_relative_to_dir(f, dir)))
.get_urls_from_HTML_file(f)
})
names(urls) <- files
urls <- Filter(length, urls)
if(length(urls)) {
parents <- rep.int(.file_path_relative_to_dir(names(urls), dir),
lengths(urls))
urls <- unlist(urls, use.names = FALSE)
}
url_db(urls, parents)
}
url_db_from_PDF_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
urls <- parents <- character()
if(is.null(files))
files <- list.files(dir, pattern = "[.]pdf$",
full.names = TRUE,
recursive = recursive)
## FIXME: this would be simpler with full.names = FALSE and without
## tools:::.file_path_relative_to_dir().
urls <-
lapply(files,
function(f) {
if(verbose)
message(sprintf("processing %s",
.file_path_relative_to_dir(f, dir)))
.get_urls_from_PDF_file(f)
})
names(urls) <- files
urls <- Filter(length, urls)
if(length(urls)) {
parents <- rep.int(.file_path_relative_to_dir(names(urls), dir),
lengths(urls))
urls <- unlist(urls, use.names = FALSE)
}
url_db(urls, parents)
}
url_db_from_package_Rd_db <-
function(db)
{
urls <- Filter(length, lapply(db, .get_urls_from_Rd))
url_db(unlist(urls, use.names = FALSE),
rep.int(file.path("man", names(urls)),
lengths(urls)))
}
url_db_from_package_metadata <-
function(meta)
{
urls <- character()
fields <- c("URL", "BugReports")
for(v in meta[fields]) {
if(is.na(v)) next
pattern <-
"<(URL: *)?((https?|ftp)://[^[:space:],]*)[[:space:]]>"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
regmatches(v, m) <- ""
pattern <- "(^|[^>\"])((https?|ftp)://[^[:space:],]*)"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
}
url_db(urls, rep.int("DESCRIPTION", length(urls)))
}
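## Illustrative example (not run): for a (placeholder) DESCRIPTION with
##   URL: https://example.org/pkg, <https://example.org/devel>
## the first pattern extracts the angle-bracketed URL and removes it,
## and the second pattern then picks up the remaining bare URL
## (stopping at whitespace or a comma), so both get Parent "DESCRIPTION".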
url_db_from_package_citation <-
function(dir, meta, installed = FALSE)
{
urls <- character()
path <- if(installed) "CITATION" else file.path("inst", "CITATION")
cfile <- file.path(dir, path)
if(file.exists(cfile)) {
cinfo <- .read_citation_quietly(cfile, meta)
if(!inherits(cinfo, "error"))
urls <- trimws(unique(unlist(cinfo$url, use.names = FALSE)))
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_news <-
function(dir, installed = FALSE)
{
path <- if(installed) "NEWS.Rd" else file.path("inst", "NEWS.Rd")
nfile <- file.path(dir, path)
urls <-
if(file.exists(nfile)) {
macros <- initialRdMacros()
.get_urls_from_Rd(prepare_Rd(parse_Rd(nfile, macros = macros),
stages = "install"))
} else character()
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_HTML_files <-
function(dir, installed = FALSE)
{
path <- if(installed) "doc" else file.path("inst", "doc")
files <- Sys.glob(file.path(dir, path, "*.html"))
if(installed && file.exists(rfile <- file.path(dir, "README.html")))
files <- c(files, rfile)
url_db_from_HTML_files(dir, files = files)
}
url_db_from_package_README_md <-
function(dir, installed = FALSE)
{
urls <- path <- character()
rfile <- Filter(file.exists,
c(if(!installed)
file.path(dir, "inst", "README.md"),
file.path(dir, "README.md")))[1L]
if(!is.na(rfile) && nzchar(Sys.which("pandoc"))) {
path <- .file_path_relative_to_dir(rfile, dir)
tfile <- tempfile("README", fileext = ".html")
on.exit(unlink(tfile))
out <- .pandoc_md_for_CRAN(rfile, tfile)
if(!out$status) {
urls <- .get_urls_from_HTML_file(tfile)
}
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_NEWS_md <-
function(dir, installed = FALSE)
{
urls <- path <- character()
nfile <- Filter(file.exists,
c(if(!installed)
file.path(dir, "inst", "NEWS.md"),
file.path(dir, "NEWS.md")))[1L]
if(!is.na(nfile) && nzchar(Sys.which("pandoc"))) {
path <- .file_path_relative_to_dir(nfile, dir)
tfile <- tempfile("NEWS", fileext = ".html")
on.exit(unlink(tfile))
out <- .pandoc_md_for_CRAN(nfile, tfile)
if(!out$status) {
urls <- .get_urls_from_HTML_file(tfile)
}
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_sources <-
function(dir, add = FALSE) {
meta <- .read_description(file.path(dir, "DESCRIPTION"))
db <- rbind(url_db_from_package_metadata(meta),
url_db_from_package_Rd_db(Rd_db(dir = dir)),
url_db_from_package_citation(dir, meta),
url_db_from_package_news(dir))
if(requireNamespace("xml2", quietly = TRUE)) {
db <- rbind(db,
url_db_from_package_HTML_files(dir),
url_db_from_package_README_md(dir),
url_db_from_package_NEWS_md(dir)
)
}
if(add)
db$Parent <- file.path(basename(dir), db$Parent)
db
}
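## Illustrative usage (not run; these are internal helpers, so from
## outside the package they would need to be accessed via tools:::):
##   db <- url_db_from_package_sources("path/to/package/sources")
##   check_url_db(db)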
url_db_from_installed_packages <-
function(packages, lib.loc = NULL, verbose = FALSE)
{
if(!length(packages)) return()
one <- function(p) {
if(verbose)
message(sprintf("processing %s", p))
dir <- system.file(package = p, lib.loc = lib.loc)
if(dir == "") return()
meta <- .read_description(file.path(dir, "DESCRIPTION"))
rddb <- Rd_db(p, lib.loc = dirname(dir))
db <- rbind(url_db_from_package_metadata(meta),
url_db_from_package_Rd_db(rddb),
url_db_from_package_citation(dir, meta,
installed = TRUE),
url_db_from_package_news(dir, installed = TRUE))
if(requireNamespace("xml2", quietly = TRUE)) {
db <- rbind(db,
url_db_from_package_HTML_files(dir,
installed = TRUE),
url_db_from_package_README_md(dir,
installed = TRUE),
url_db_from_package_NEWS_md(dir,
installed = TRUE)
)
}
db$Parent <- file.path(p, db$Parent)
db
}
do.call(rbind,
c(lapply(packages, one),
list(make.row.names = FALSE)))
}
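## Illustrative usage (not run):
##   db <- url_db_from_installed_packages(c("tools", "utils"),
##                                        verbose = TRUE)
## gives one combined db whose Parent entries are prefixed with the
## package name, e.g. "<pkg>/DESCRIPTION" or "<pkg>/man/<file>.Rd".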
get_IANA_HTTP_status_code_db <-
function()
{
## See
## <http://www.iana.org/assignments/http-status-codes/http-status-codes.xhtml>
baseurl <- "http://www.iana.org/assignments/http-status-codes/"
db <- utils::read.csv(url(paste0(baseurl, "http-status-codes-1.csv")),
stringsAsFactors = FALSE)
## Drop "Unassigned".
db[db$Description != "Unassigned", ]
}
## See <https://en.wikipedia.org/wiki/List_of_FTP_server_return_codes>
## and <http://tools.ietf.org/html/rfc959>,
## Section 4.2.2 "Numeric Order List of Reply Codes",
## and <https://tools.ietf.org/html/rfc2228>,
## Section 5 "New FTP Replies".
## Only need those >= 400.
table_of_FTP_server_return_codes <-
c("421" = "Service not available, closing control connection.",
"425" = "Can't open data connection.",
"426" = "Connection closed; transfer aborted.",
"430" = "Invalid username or password",
"431" = "Need some unavailable resource to process security.",
"434" = "Requested host unavailable.",
"450" = "Requested file action not taken.",
"451" = "Requested action aborted: local error in processing.",
"452" = "Requested action not taken. Insufficient storage space in system.",
"500" = "Syntax error, command unrecognized.",
"501" = "Syntax error in parameters or arguments.",
"502" = "Command not implemented.",
"503" = "Bad sequence of commands.",
"504" = "Command not implemented for that parameter.",
"530" = "Not logged in.",
"532" = "Need account for storing files.",
"533" = "Command protection level denied for policy reasons.",
"534" = "Request denied for policy reasons.",
"535" = "Failed security check (hash, sequence, etc).",
"536" = "Requested PROT level not supported by mechanism.",
"537" = "Command protection level not supported by security mechanism.",
"550" = "Requested action not taken. File unavailable",
"551" = "Requested action aborted: page type unknown.",
"552" = "Requested file action aborted. Exceeded storage allocation (for current directory or dataset).",
"553" = "Requested action not taken. File name not allowed.",
"631" = "Integrity protected reply.",
"632" = "Confidentiality and integrity protected reply.",
"633" = "Confidentiality protected reply."
)
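## Used as a simple lookup table, e.g. (not run):
##   table_of_FTP_server_return_codes[["550"]]
##   ## [1] "Requested action not taken. File unavailable"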
check_url_db <-
function(db, remote = TRUE, verbose = FALSE)
{
use_curl <-
config_val_to_logical(Sys.getenv("_R_CHECK_URLS_USE_CURL_",
"TRUE")) &&
requireNamespace("curl", quietly = TRUE)
.gather <- function(u = character(),
p = list(),
s = rep.int("", length(u)),
m = rep.int("", length(u)),
new = rep.int("", length(u)),
cran = rep.int("", length(u)),
spaces = rep.int("", length(u)),
R = rep.int("", length(u))) {
y <- data.frame(URL = u, From = I(p), Status = s, Message = m,
New = new, CRAN = cran, Spaces = spaces, R = R,
row.names = NULL, stringsAsFactors = FALSE)
y$From <- p
class(y) <- c("check_url_db", "data.frame")
y
}
.fetch <- function(u) {
if(verbose) message(sprintf("processing %s", u))
h <- tryCatch(curlGetHeaders(u), error = identity)
if(inherits(h, "error")) {
## Currently, this info is only used in .check_http().
## Might be useful for checking ftps too, so simply leave it
## here instead of moving to .check_http().
msg <- conditionMessage(h)
if (grepl("libcurl error code (51|60)", msg)) {
h2 <- tryCatch(curlGetHeaders(u, verify = FALSE),
error = identity)
attr(h, "no-verify") <- h2
}
}
h
}
.check_ftp <- function(u) {
h <- .fetch(u)
if(inherits(h, "error")) {
s <- "-1"
msg <- sub("[[:space:]]*$", "", conditionMessage(h))
} else {
s <- as.character(attr(h, "status"))
msg <- table_of_FTP_server_return_codes[s]
}
c(s, msg, "", "")
}
.check_http <- if(remote)
function(u) c(.check_http_A(u), .check_http_B(u))
else
function(u) c(rep.int("", 3L), .check_http_B(u))
.check_http_A <- function(u) {
h <- .fetch(u)
newLoc <- ""
if(inherits(h, "error")) {
s <- "-1"
msg <- sub("[[:space:]]*$", "", conditionMessage(h))
if (!is.null(v <- attr(h, "no-verify"))) {
s2 <- as.character(attr(v, "status"))
msg <- paste0(msg, "\n\t(Status without verification: ",
table_of_HTTP_status_codes[s2], ")")
}
} else {
s <- as.character(attr(h, "status"))
msg <- table_of_HTTP_status_codes[s]
}
## Look for redirected URLs
if (any(grepl("301 Moved Permanently", h, useBytes = TRUE))) {
ind <- grep("^[Ll]ocation: ", h, useBytes = TRUE)
if (length(ind))
newLoc <- sub("^[Ll]ocation: ([^\r]*)\r\n", "\\1", h[max(ind)])
}
##
if((s != "200") && use_curl) {
g <- .curl_GET_status(u)
if(g == "200") {
s <- g
msg <- "OK"
}
}
## A mis-configured site
if (s == "503" && any(grepl("www.sciencedirect.com", c(u, newLoc))))
s <- "405"
c(s, msg, newLoc)
}
.check_http_B <- function(u) {
ul <- tolower(u)
cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) &&
!grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]]+(html|pdf|rds)$",
ul)) ||
(grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$",
ul)) ||
startsWith(ul, "http://cran.r-project.org") ||
any(startsWith(ul, mirrors)))
R <- grepl("^http://(www|bugs|journal).r-project.org", ul)
spaces <- grepl(" ", u)
c(if(cran) u else "", if(spaces) u else "", if(R) u else "")
}
bad <- .gather()
if(!NROW(db)) return(bad)
## Could also use utils::getCRANmirrors(local.only = TRUE).
mirrors <- c(utils::read.csv(file.path(R.home("doc"),
"CRAN_mirrors.csv"),
as.is = TRUE, encoding = "UTF-8")$URL,
"http://cran.rstudio.com/",
"https://cran.rstudio.com/")
mirrors <- tolower(sub("/$", "", mirrors))
if(inherits(db, "check_url_db")) {
## Allow re-checking check results.
parents <- db$From
urls <- db$URL
} else {
parents <- split(db$Parent, db$URL)
urls <- names(parents)
}
parts <- parse_URI_reference(urls)
## Empty URLs.
ind <- apply(parts == "", 1L, all)
if(any(ind)) {
len <- sum(ind)
bad <- rbind(bad,
.gather(urls[ind],
parents[ind],
m = rep.int("Empty URL", len)))
}
## Invalid URI schemes.
schemes <- parts[, 1L]
ind <- is.na(match(schemes,
c("",
IANA_URI_scheme_db$URI_Scheme,
## Also allow 'javascript' scheme, see
## <https://tools.ietf.org/html/draft-hoehrmann-javascript-scheme-03>
## (but apparently never registered with IANA).
"javascript")))
if(any(ind)) {
len <- sum(ind)
msg <- rep.int("Invalid URI scheme", len)
doi <- schemes[ind] == "doi"
if(any(doi))
msg[doi] <- paste(msg[doi], "(use \\doi for DOIs in Rd markup)")
bad <- rbind(bad,
.gather(urls[ind], parents[ind], m = msg))
}
## ftp.
pos <- which(schemes == "ftp")
if(length(pos) && remote) {
results <- do.call(rbind, lapply(urls[pos], .check_ftp))
status <- as.numeric(results[, 1L])
ind <- (status < 0L) | (status >= 400L)
if(any(ind)) {
pos <- pos[ind]
s <- as.character(status[ind])
s[s == "-1"] <- "Error"
m <- results[ind, 2L]
m[is.na(m)] <- ""
bad <- rbind(bad,
.gather(urls[pos], parents[pos], s, m))
}
}
## http/https.
pos <- which(schemes == "http" | schemes == "https")
if(length(pos)) {
results <- do.call(rbind, lapply(urls[pos], .check_http))
status <- as.numeric(results[, 1L])
## 405 means the server does not allow HEAD requests.
## Maybe also skip 500, 503 and 504, as these are likely to be
## temporary issues.
ind <- is.na(match(status, c(200L, 405L, NA))) |
nzchar(results[, 4L]) |
nzchar(results[, 5L]) |
nzchar(results[, 6L])
if(any(ind)) {
pos <- pos[ind]
s <- as.character(status[ind])
s[is.na(s)] <- ""
s[s == "-1"] <- "Error"
m <- results[ind, 2L]
m[is.na(m)] <- ""
bad <- rbind(bad,
.gather(urls[pos], parents[pos], s, m,
results[ind, 3L],
results[ind, 4L],
results[ind, 5L],
results[ind, 6L]))
}
}
bad
}
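## Illustrative usage (not run; remote checking needs internet access):
##   bad <- check_url_db(url_db_from_package_sources("path/to/pkg"),
##                       remote = TRUE, verbose = TRUE)
##   bad           # zero rows means no problematic URLs were found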
format.check_url_db <-
function(x, ...)
{
if(!NROW(x)) return(character())
u <- x$URL
new <- x$New
ind <- nzchar(new)
u[ind] <- sprintf("%s (moved to %s)", u[ind], new[ind])
paste0(sprintf("URL: %s", u),
sprintf("\nFrom: %s",
vapply(x$From, paste, "", collapse = "\n ")),
ifelse((s <- x$Status) == "",
"",
sprintf("\nStatus: %s", s)),
ifelse((m <- x$Message) == "",
"",
sprintf("\nMessage: %s", gsub("\n", "\n ", m))),
ifelse((m <- x$Spaces) == "",
"",
"\nURL contains spaces"),
ifelse((m <- x$CRAN) == "",
"",
"\nCRAN URL not in canonical form"),
ifelse((m <- x$R) == "",
"",
"\nR-project URL not in canonical form")
)
}
print.check_url_db <-
function(x, ...)
{
if(NROW(x))
writeLines(paste(format(x), collapse = "\n\n"))
invisible(x)
}
as.matrix.check_url_db <-
function(x, ...)
{
n <- lengths(x[["From"]])
y <- do.call(cbind,
c(list(URL = rep.int(x[["URL"]], n),
Parent = unlist(x[["From"]])),
lapply(x[-c(1L, 2L)], rep.int, n)))
rownames(y) <- NULL
y
}
.curl_GET_status <-
function(u, verbose = FALSE)
{
if(verbose)
message(sprintf("processing %s", u))
## Configure curl handle for better luck with JSTOR URLs/DOIs.
## Alternatively, special-case requests to
## https?://doi.org/10.2307
## https?://www.jstor.org
h <- curl::new_handle()
curl::handle_setopt(h,
cookiesession = 1,
followlocation = 1,
http_version = 2L,
ssl_enable_alpn = 0)
g <- tryCatch(curl::curl_fetch_memory(u, handle = h),
error = identity)
if(inherits(g, "error"))
-1L
else
g$status_code
}
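## Illustrative example (not run; requires the 'curl' package):
##   .curl_GET_status("https://www.r-project.org/")   # 200L on success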