# File src/library/tools/R/urltools.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 2015-2019 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/

get_IANA_URI_scheme_db <-
function()
{
    ## See
    ## <https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml>.
    baseurl <- "https://www.iana.org/assignments/uri-schemes/"
    db <- utils::read.csv(url(paste0(baseurl, "uri-schemes-1.csv")),
                          stringsAsFactors = FALSE, encoding = "UTF-8")
    names(db) <- chartr(".", "_", names(db))
    db
}
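
## A minimal usage sketch (assumes internet access; the column name
## 'URI_Scheme' results from the chartr() renaming above):
##   db <- get_IANA_URI_scheme_db()
##   head(db$URI_Scheme)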

parse_URI_reference <-
function(x)
{
    ## See RFC 3986 <https://www.ietf.org/rfc/rfc3986.txt>.
    re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
    if(length(x)) {
        y <- do.call(rbind, regmatches(x, regexec(re, x)))
        y <- y[, c(3, 5, 6, 8, 10), drop = FALSE]
    } else {
        y <- matrix(character(), 0L, 5L)
    }
    colnames(y) <- c("scheme", "authority", "path", "query", "fragment")
    y
}
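
## A minimal sketch of the decomposition (cf. RFC 3986, Appendix B):
##   parse_URI_reference("https://user@example.org:8080/a/b?x=1#top")
##   ##      scheme  authority              path   query fragment
##   ## [1,] "https" "user@example.org:8080" "/a/b" "x=1" "top"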

.get_urls_from_Rd <-
function(x)
{
    urls <- character()
    recurse <- function(e) {
        tag <- attr(e, "Rd_tag")
        ## Rd2HTML and Rd2latex remove whitespace and \n from URLs.
        if(identical(tag, "\\url")) {
            urls <<-
                c(urls, trimws(gsub("\n", "", .Rd_deparse(e, tag = FALSE),
                                    fixed = TRUE, useBytes = TRUE)))
        } else if(identical(tag, "\\href")) {
            urls <<-
                c(urls, trimws(gsub("\n", "",
                                    .Rd_deparse(e[[1L]], tag = FALSE),
                                    fixed = TRUE, useBytes = TRUE)))
        } else if(is.list(e))
            lapply(e, recurse)
    }
    lapply(x, recurse)
    unique(trimws(urls))
}
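
## A minimal sketch (assumes an Rd fragment parsed on the fly):
##   rd <- parse_Rd(textConnection("\\url{https://www.R-project.org}"),
##                  fragment = TRUE)
##   .get_urls_from_Rd(rd)  # "https://www.R-project.org"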

.get_urls_from_HTML_file <-
function(f)
{
    doc <- xml2::read_html(f)
    if(!inherits(doc, "xml_node")) return(character())
    nodes <- xml2::xml_find_all(doc, "//a")
    hrefs <- xml2::xml_attr(nodes, "href")
    unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")])
}
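
## A minimal sketch (assumes the xml2 package is available):
##   f <- tempfile(fileext = ".html")
##   writeLines(c('<html><body>',
##                '<a href="https://example.org">x</a>',
##                '<a href="#local">y</a>',
##                '</body></html>'), f)
##   .get_urls_from_HTML_file(f)  # "https://example.org"; the
##                                # fragment-only href is dropped.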

.get_urls_from_PDF_file <-
function(f)
{
    ## There seems to be no straightforward way to extract hyperlinks
    ## from a PDF, hence first convert to (XML-flavored) HTML.
    ## Note that pdftohtml always writes its output to the cwd ...
    owd <- getwd()
    dir.create(d <- tempfile())
    on.exit({ unlink(d, recursive = TRUE); setwd(owd) })
    file.copy(normalizePath(f), d)
    setwd(d)
    g <- tempfile(tmpdir = d, fileext = ".xml")
    system2("pdftohtml",
            c("-s -q -i -c -xml", basename(f), basename(g)))
    ## Oh dear: pdftohtml can apparently fail yet still exit with a
    ## zero status, so test for the output file instead.
    if(file.exists(g))
        .get_urls_from_HTML_file(g)
    else
        character()
}
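
## A minimal sketch (assumes poppler's pdftohtml utility is on the
## PATH; the file name is hypothetical):
##   .get_urls_from_PDF_file("vignette.pdf")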

url_db <-
function(urls, parents)
{
    ## Some people get leading LFs in URLs, so trim before storing.
    db <- data.frame(URL = trimws(as.character(urls)),
                     Parent = as.character(parents),
                     stringsAsFactors = FALSE)
    class(db) <- c("url_db", "data.frame")
    db
}
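
## A minimal sketch: a two-column data frame of (trimmed) URLs and the
## files they came from.
##   url_db("\nhttps://example.org", "man/foo.Rd")
##   ##                   URL     Parent
##   ## 1 https://example.org man/foo.Rd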

url_db_from_HTML_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
    urls <- parents <- character()
    if(is.null(files))
        files <- list.files(dir, pattern = "[.]html$",
                            full.names = TRUE,
                            recursive = recursive)
    urls <-
        lapply(files,
               function(f) {
                   if(verbose)
                       message(sprintf("processing %s",
                                       .file_path_relative_to_dir(f, dir)))
                   .get_urls_from_HTML_file(f)
               })
    names(urls) <- files
    urls <- Filter(length, urls)
    if(length(urls)) {
        parents <- rep.int(.file_path_relative_to_dir(names(urls), dir),
                           lengths(urls))
        urls <- unlist(urls, use.names = FALSE)
    }
    url_db(urls, parents)
}

url_db_from_PDF_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
    urls <- parents <- character()
    if(is.null(files))
        files <- list.files(dir, pattern = "[.]pdf$",
                            full.names = TRUE,
                            recursive = recursive)
    ## FIXME: this is simpler to do with full.names = FALSE and without
    ## tools:::.file_path_relative_to_dir().
    urls <-
        lapply(files,
               function(f) {
                   if(verbose)
                       message(sprintf("processing %s",
                                       .file_path_relative_to_dir(f, dir)))
                   .get_urls_from_PDF_file(f)
               })
    names(urls) <- files
    urls <- Filter(length, urls)
    if(length(urls)) {
        parents <- rep.int(.file_path_relative_to_dir(names(urls), dir),
                           lengths(urls))
        urls <- unlist(urls, use.names = FALSE)
    }
    url_db(urls, parents)
}

url_db_from_package_Rd_db <-
function(db)
{
    urls <- Filter(length, lapply(db, .get_urls_from_Rd))
    url_db(unlist(urls, use.names = FALSE),
           rep.int(file.path("man", names(urls)),
                   lengths(urls)))
}

url_db_from_package_metadata <-
function(meta)
{
    urls <- character()
    fields <- c("URL", "BugReports")
    for(v in meta[fields]) {
        if(is.na(v)) next
        pattern <-
            "<(URL: *)?((https?|ftp)://[^[:space:],]*)[[:space:]]*>"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
        regmatches(v, m) <- ""
        pattern <- "(^|[^>\"])((https?|ftp)://[^[:space:],]*)"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
    }

    url_db(urls, rep.int("DESCRIPTION", length(urls)))
}
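
## A minimal sketch of the two-pass extraction (hypothetical field
## values): URLs in angle brackets are collected first and removed,
## then bare URLs not preceded by '>' or '"' are collected.
##   meta <- c(URL = "<https://example.org>, http://example.com/pkg",
##             BugReports = NA)
##   url_db_from_package_metadata(meta)$URL
##   ## [1] "https://example.org"    "http://example.com/pkg"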

url_db_from_package_citation <-
function(dir, meta, installed = FALSE)
{
    urls <- character()
    path <- if(installed) "CITATION" else file.path("inst", "CITATION")
    cfile <- file.path(dir, path)
    if(file.exists(cfile)) {
        cinfo <- .read_citation_quietly(cfile, meta)
        if(!inherits(cinfo, "error"))
            urls <- trimws(unique(unlist(cinfo$url, use.names = FALSE)))
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_news <-
function(dir, installed = FALSE)
{
    path <- if(installed) "NEWS.Rd" else file.path("inst", "NEWS.Rd")
    nfile <- file.path(dir, path)
    urls <-
        if(file.exists(nfile)) {
            macros <- initialRdMacros()
            .get_urls_from_Rd(prepare_Rd(parse_Rd(nfile, macros = macros),
                                         stages = "install"))
        } else character()
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_HTML_files <-
function(dir, installed = FALSE)
{
    path <- if(installed) "doc" else file.path("inst", "doc")
    files <- Sys.glob(file.path(dir, path, "*.html"))
    if(installed && file.exists(rfile <- file.path(dir, "README.html")))
        files <- c(files, rfile)
    url_db_from_HTML_files(dir, files = files)
}

url_db_from_package_README_md <-
function(dir, installed = FALSE)
{
    urls <- path <- character()
    rfile <- Filter(file.exists,
                    c(if(!installed)
                          file.path(dir, "inst", "README.md"),
                      file.path(dir, "README.md")))[1L]
    if(!is.na(rfile) && nzchar(Sys.which("pandoc"))) {
        path <- .file_path_relative_to_dir(rfile, dir)
        tfile <- tempfile("README", fileext = ".html")
        on.exit(unlink(tfile))
        out <- .pandoc_md_for_CRAN(rfile, tfile)
        if(!out$status) {
            urls <- .get_urls_from_HTML_file(tfile)
        }
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_NEWS_md <-
function(dir, installed = FALSE)
{
    urls <- path <- character()
    nfile <- Filter(file.exists,
                    c(if(!installed)
                          file.path(dir, "inst", "NEWS.md"),
                      file.path(dir, "NEWS.md")))[1L]
    if(!is.na(nfile) && nzchar(Sys.which("pandoc"))) {
        path <- .file_path_relative_to_dir(nfile, dir)
        tfile <- tempfile("NEWS", fileext = ".html")
        on.exit(unlink(tfile))
        out <- .pandoc_md_for_CRAN(nfile, tfile)
        if(!out$status) {
            urls <- .get_urls_from_HTML_file(tfile)
        }
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_sources <-
function(dir, add = FALSE) {
    meta <- .read_description(file.path(dir, "DESCRIPTION"))
    db <- rbind(url_db_from_package_metadata(meta),
                url_db_from_package_Rd_db(Rd_db(dir = dir)),
                url_db_from_package_citation(dir, meta),
                url_db_from_package_news(dir))
    if(requireNamespace("xml2", quietly = TRUE)) {
        db <- rbind(db,
                    url_db_from_package_HTML_files(dir),
                    url_db_from_package_README_md(dir),
                    url_db_from_package_NEWS_md(dir)
                    )
    }
    if(add)
        db$Parent <- file.path(basename(dir), db$Parent)
    db
}

url_db_from_installed_packages <-
function(packages, lib.loc = NULL, verbose = FALSE)
{
    if(!length(packages)) return()
    one <- function(p) {
        if(verbose)
            message(sprintf("processing %s", p))
        dir <- system.file(package = p, lib.loc = lib.loc)
        if(dir == "") return()
        meta <- .read_description(file.path(dir, "DESCRIPTION"))
        rddb <- Rd_db(p, lib.loc = dirname(dir))
        db <- rbind(url_db_from_package_metadata(meta),
                    url_db_from_package_Rd_db(rddb),
                    url_db_from_package_citation(dir, meta,
                                                 installed = TRUE),
                    url_db_from_package_news(dir, installed = TRUE))
        if(requireNamespace("xml2", quietly = TRUE)) {
            db <- rbind(db,
                        url_db_from_package_HTML_files(dir,
                                                       installed = TRUE),
                        url_db_from_package_README_md(dir,
                                                      installed = TRUE),
                        url_db_from_package_NEWS_md(dir,
                                                    installed = TRUE)
                        )
        }
        db$Parent <- file.path(p, db$Parent)
        db
    }
    do.call(rbind,
            c(lapply(packages, one),
              list(make.row.names = FALSE)))
}
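
## A minimal usage sketch (assumes the named packages are installed):
##   db <- url_db_from_installed_packages(c("MASS", "tools"),
##                                        verbose = TRUE)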

get_IANA_HTTP_status_code_db <-
function()
{
    ## See
    ## <https://www.iana.org/assignments/http-status-codes/http-status-codes.xhtml>
    baseurl <- "https://www.iana.org/assignments/http-status-codes/"
    db <- utils::read.csv(url(paste0(baseurl, "http-status-codes-1.csv")),
                          stringsAsFactors = FALSE)
    ## Drop "Unassigned".
    db[db$Description != "Unassigned", ]
}

## See <https://en.wikipedia.org/wiki/List_of_FTP_server_return_codes>
## and <http://tools.ietf.org/html/rfc959>,
## Section 4.2.2 "Numeric Order List of Reply Codes",
## and <https://tools.ietf.org/html/rfc2228>,
## Section 5 "New FTP Replies".
## Only need those >= 400.
table_of_FTP_server_return_codes <-
    c("421" = "Service not available, closing control connection.",
      "425" = "Can't open data connection.",
      "426" = "Connection closed; transfer aborted.",
      "430" = "Invalid username or password",
      "431" = "Need some unavailable resource to process security.",
      "434" = "Requested host unavailable.",
      "450" = "Requested file action not taken.",
      "451" = "Requested action aborted: local error in processing.",
      "452" = "Requested action not taken. Insufficient storage space in system.",
      "500" = "Syntax error, command unrecognized.",
      "501" = "Syntax error in parameters or arguments.",
      "502" = "Command not implemented.",
      "503" = "Bad sequence of commands.",
      "504" = "Command not implemented for that parameter.",
      "530" = "Not logged in.",
      "532" = "Need account for storing files.",
      "533" = "Command protection level denied for policy reasons.",
      "534" = "Request denied for policy reasons.",
      "535" = "Failed security check (hash, sequence, etc).",
      "536" = "Requested PROT level not supported by mechanism.",
      "537" = "Command protection level not supported by security mechanism.",
      "550" = "Requested action not taken. File unavailable",
      "551" = "Requested action aborted: page type unknown.",
      "552" = "Requested file action aborted. Exceeded storage allocation (for current directory or dataset).",
      "553" = "Requested action not taken. File name not allowed.",
      "631" = "Integrity protected reply.",
      "632" = "Confidentiality and integrity protected reply.",
      "633" = "Confidentiality protected reply."
      )
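
## E.g., mapping a status code reported by curlGetHeaders() to a
## message:
##   table_of_FTP_server_return_codes["550"]
##   ## "Requested action not taken. File unavailable"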

check_url_db <-
function(db, remote = TRUE, verbose = FALSE)
{
    use_curl <-
        config_val_to_logical(Sys.getenv("_R_CHECK_URLS_USE_CURL_",
                                         "TRUE")) &&
        requireNamespace("curl", quietly = TRUE)

    .gather <- function(u = character(),
                        p = list(),
                        s = rep.int("", length(u)),
                        m = rep.int("", length(u)),
                        new = rep.int("", length(u)),
                        cran = rep.int("", length(u)),
                        spaces = rep.int("", length(u)),
                        R = rep.int("", length(u))) {
        y <- data.frame(URL = u, From = I(p), Status = s, Message = m,
                        New = new, CRAN = cran, Spaces = spaces, R = R,
                        row.names = NULL, stringsAsFactors = FALSE)
        y$From <- p
        class(y) <- c("check_url_db", "data.frame")
        y
    }

    .fetch <- function(u) {
        if(verbose) message(sprintf("processing %s", u))
        h <- tryCatch(curlGetHeaders(u), error = identity)
        if(inherits(h, "error")) {
            ## Currently, this info is only used by .check_http_A().
            ## It might be useful for checking ftps too, so simply
            ## leave it here instead of moving it there.
            msg <- conditionMessage(h)
            if (grepl("libcurl error code (51|60)", msg)) {
                h2 <- tryCatch(curlGetHeaders(u, verify = FALSE),
                               error = identity)
                attr(h, "no-verify") <- h2
            }
        }
        h
    }

    .check_ftp <- function(u) {
        h <- .fetch(u)
        if(inherits(h, "error")) {
            s <- "-1"
            msg <- sub("[[:space:]]*$", "", conditionMessage(h))
        } else {
            s <- as.character(attr(h, "status"))
            msg <- table_of_FTP_server_return_codes[s]
        }
        c(s, msg, "", "")
    }

    .check_http <- if(remote)
        function(u) c(.check_http_A(u), .check_http_B(u))
    else
        function(u) c(rep.int("", 3L), .check_http_B(u))

    .check_http_A <- function(u) {
        h <- .fetch(u)
        newLoc <- ""
        if(inherits(h, "error")) {
            s <- "-1"
            msg <- sub("[[:space:]]*$", "", conditionMessage(h))
            if (!is.null(v <- attr(h, "no-verify"))) {
                s2 <- as.character(attr(v, "status"))
                msg <- paste0(msg, "\n\t(Status without verification: ",
                              table_of_HTTP_status_codes[s2], ")")
            }
        } else {
            s <- as.character(attr(h, "status"))
            msg <- table_of_HTTP_status_codes[s]
        }
        ## Look for redirected URLs
        if (any(grepl("301 Moved Permanently", h, useBytes = TRUE))) {
            ind <- grep("^[Ll]ocation: ", h, useBytes = TRUE)
            if (length(ind))
                newLoc <- sub("^[Ll]ocation: ([^\r]*)\r\n", "\\1",
                              h[max(ind)])
        }
        ##
        if((s != "200") && use_curl) {
            g <- .curl_GET_status(u)
            if(g == "200") {
                s <- g
                msg <- "OK"
            }
        }
        ## A mis-configured site: map www.sciencedirect.com's spurious
        ## 503 for HEAD requests to 405, which is accepted below.
        if (s == "503" && any(grepl("www.sciencedirect.com", c(u, newLoc))))
            s <- "405"
        c(s, msg, newLoc)
    }

    .check_http_B <- function(u) {
        ul <- tolower(u)
        cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) &&
                  !grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]]+(html|pdf|rds)$",
                         ul)) ||
                 (grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$",
                        ul)) ||
                 startsWith(ul, "http://cran.r-project.org") ||
                 any(startsWith(ul, mirrors)))
        R <- grepl("^http://(www|bugs|journal).r-project.org", ul)
        spaces <- grepl(" ", u)
        c(if(cran) u else "", if(spaces) u else "", if(R) u else "")
    }

    bad <- .gather()

    if(!NROW(db)) return(bad)

    ## Could also use utils::getCRANmirrors(local.only = TRUE).
    mirrors <- c(utils::read.csv(file.path(R.home("doc"),
                                           "CRAN_mirrors.csv"),
                                 as.is = TRUE, encoding = "UTF-8")$URL,
                 "http://cran.rstudio.com/",
                 "https://cran.rstudio.com/")
    mirrors <- tolower(sub("/$", "", mirrors))

    if(inherits(db, "check_url_db")) {
        ## Allow re-checking check results.
        parents <- db$From
        urls <- db$URL
    } else {
        parents <- split(db$Parent, db$URL)
        urls <- names(parents)
    }

    parts <- parse_URI_reference(urls)

    ## Empty URLs.
    ind <- apply(parts == "", 1L, all)
    if(any(ind)) {
        len <- sum(ind)
        bad <- rbind(bad,
                     .gather(urls[ind],
                             parents[ind],
                             m = rep.int("Empty URL", len)))
    }

    ## Invalid URI schemes.
    schemes <- parts[, 1L]
    ind <- is.na(match(schemes,
                       c("",
                         IANA_URI_scheme_db$URI_Scheme,
                         ## Also allow 'javascript' scheme, see
                         ## <https://tools.ietf.org/html/draft-hoehrmann-javascript-scheme-03>
                         ## (but apparently never registered with IANA).
                         "javascript")))
    if(any(ind)) {
        len <- sum(ind)
        msg <- rep.int("Invalid URI scheme", len)
        doi <- schemes[ind] == "doi"
        if(any(doi))
            msg[doi] <- paste(msg[doi], "(use \\doi for DOIs in Rd markup)")
        bad <- rbind(bad,
                     .gather(urls[ind], parents[ind], m = msg))
    }

    ## ftp.
    pos <- which(schemes == "ftp")
    if(length(pos) && remote) {
        results <- do.call(rbind, lapply(urls[pos], .check_ftp))
        status <- as.numeric(results[, 1L])
        ind <- (status < 0L) | (status >= 400L)
        if(any(ind)) {
            pos <- pos[ind]
            s <- as.character(status[ind])
            s[s == "-1"] <- "Error"
            m <- results[ind, 2L]
            m[is.na(m)] <- ""
            bad <- rbind(bad,
                         .gather(urls[pos], parents[pos], s, m))
        }
    }

    ## http/https.
    pos <- which(schemes == "http" | schemes == "https")
    if(length(pos)) {
        results <- do.call(rbind, lapply(urls[pos], .check_http))
        status <- as.numeric(results[, 1L])
        ## 405 means the server does not allow HEAD requests.
        ## Maybe also skip 500, 503 and 504, which are likely to be
        ## temporary issues.
        ind <- is.na(match(status, c(200L, 405L, NA))) |
            nzchar(results[, 4L]) |
            nzchar(results[, 5L]) |
            nzchar(results[, 6L])
        if(any(ind)) {
            pos <- pos[ind]
            s <- as.character(status[ind])
            s[is.na(s)] <- ""
            s[s == "-1"] <- "Error"
            m <- results[ind, 2L]
            m[is.na(m)] <- ""
            bad <- rbind(bad,
                         .gather(urls[pos], parents[pos], s, m,
                                 results[ind, 3L],
                                 results[ind, 4L],
                                 results[ind, 5L],
                                 results[ind, 6L]))
        }
    }
    bad
}
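
## A minimal usage sketch (the package path is hypothetical; remote =
## FALSE skips the network checks and only reports structural issues
## such as empty URLs, invalid schemes, non-canonical CRAN/R-project
## URLs and URLs containing spaces):
##   db <- url_db_from_package_sources("path/to/pkg")
##   check_url_db(db, remote = FALSE)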

format.check_url_db <-
function(x, ...)
{
    if(!NROW(x)) return(character())

    u <- x$URL
    new <- x$New
    ind <- nzchar(new)
    u[ind] <- sprintf("%s (moved to %s)", u[ind], new[ind])

    paste0(sprintf("URL: %s", u),
           sprintf("\nFrom: %s",
                   vapply(x$From, paste, "", collapse = "\n ")),
           ifelse((s <- x$Status) == "",
                  "",
                  sprintf("\nStatus: %s", s)),
           ifelse((m <- x$Message) == "",
                  "",
                  sprintf("\nMessage: %s", gsub("\n", "\n ", m))),
           ifelse((m <- x$Spaces) == "",
                  "",
                  "\nURL contains spaces"),
           ifelse((m <- x$CRAN) == "",
                  "",
                  "\nCRAN URL not in canonical form"),
           ifelse((m <- x$R) == "",
                  "",
                  "\nR-project URL not in canonical form")
           )
}

print.check_url_db <-
function(x, ...)
{
    if(NROW(x))
        writeLines(paste(format(x), collapse = "\n\n"))
    invisible(x)
}

as.matrix.check_url_db <-
function(x, ...)
{
    n <- lengths(x[["From"]])
    y <- do.call(cbind,
                 c(list(URL = rep.int(x[["URL"]], n),
                        Parent = unlist(x[["From"]])),
                   lapply(x[-c(1L, 2L)], rep.int, n)))
    rownames(y) <- NULL
    y
}

.curl_GET_status <-
function(u, verbose = FALSE)
{
    if(verbose)
        message(sprintf("processing %s", u))
    ## Configure curl handle for better luck with JSTOR URLs/DOIs.
    ## Alternatively, special-case requests to
    ##   https?://doi.org/10.2307
    ##   https?://www.jstor.org
    h <- curl::new_handle()
    curl::handle_setopt(h,
                        cookiesession = 1,
                        followlocation = 1,
                        http_version = 2L,
                        ssl_enable_alpn = 0)
    g <- tryCatch(curl::curl_fetch_memory(u, handle = h),
                  error = identity)
    if(inherits(g, "error"))
        -1L
    else
        g$status_code
}
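
## A minimal usage sketch (assumes internet access and the curl
## package): the return value is the HTTP status code of a GET
## request, or -1L if the request itself failed.
##   .curl_GET_status("https://www.r-project.org/")  # 200L if reachable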