# File src/library/tools/R/urltools.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 2015-2019 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/
get_IANA_URI_scheme_db <-
function()
{
## See
## <http://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml>.
baseurl <- "http://www.iana.org/assignments/uri-schemes/"
db <- utils::read.csv(url(paste0(baseurl, "uri-schemes-1.csv")),
stringsAsFactors = FALSE, encoding = "UTF-8")
names(db) <- chartr(".", "_", names(db))
db
}
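## Illustrative usage (not run; needs network access, and assumes the
## IANA registry CSV keeps its current column layout):
##   db <- get_IANA_URI_scheme_db()
##   head(db$URI_Scheme)   # e.g. "aaa" "aaas" "about" ...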
parse_URI_reference <-
function(x)
{
## See RFC 3986 <http://www.ietf.org/rfc/rfc3986.txt>.
re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
if(length(x)) {
y <- do.call(rbind, regmatches(x, regexec(re, x)))
y <- y[, c(3, 5, 6, 8, 10), drop = FALSE]
} else {
y <- matrix(character(), 0L, 5L)
}
colnames(y) <- c("scheme", "authority", "path", "query", "fragment")
y
}
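## Illustrative example (not run), using the URI from RFC 3986 Appendix B:
##   parse_URI_reference("http://www.ics.uci.edu/pub/ietf/uri/#Related")
## yields a one-row matrix with
##   scheme = "http", authority = "www.ics.uci.edu",
##   path = "/pub/ietf/uri/", query = "", fragment = "Related".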
.get_urls_from_Rd <-
function(x)
{
urls <- character()
recurse <- function(e) {
tag <- attr(e, "Rd_tag")
## Rd2HTML and Rd2latex remove whitespace and \n from URLs.
if(identical(tag, "\\url")) {
urls <<-
c(urls, trimws(gsub("\n", "", .Rd_deparse(e, tag = FALSE),
fixed = TRUE, useBytes = TRUE)))
} else if(identical(tag, "\\href")) {
urls <<-
c(urls, trimws(gsub("\n", "",
.Rd_deparse(e[[1L]], tag = FALSE),
fixed = TRUE, useBytes = TRUE)))
} else if(is.list(e))
lapply(e, recurse)
}
lapply(x, recurse)
unique(trimws(urls))
}
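## Illustrative example (not run): for Rd content containing
##   \url{https://www.R-project.org} and
##   \href{https://bugs.R-project.org}{the bug tracker}
## this collects "https://www.R-project.org" and
## "https://bugs.R-project.org" (for \href only the URL part is kept).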
.get_urls_from_HTML_file <-
function(f)
{
doc <- xml2::read_html(f)
if(!inherits(doc, "xml_node")) return(character())
nodes <- xml2::xml_find_all(doc, "//a")
hrefs <- xml2::xml_attr(nodes, "href")
unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")])
}
.get_urls_from_PDF_file <-
function(f)
{
## There seems to be no straightforward way to extract hyperlinks from
## a PDF, hence first convert it to HTML.
## Note that pdftohtml always writes its output to the cwd ...
owd <- getwd()
dir.create(d <- tempfile())
on.exit({ unlink(d, recursive = TRUE); setwd(owd) })
file.copy(normalizePath(f), d)
setwd(d)
g <- tempfile(tmpdir = d, fileext = ".xml")
system2("pdftohtml",
c("-s -q -i -c -xml", basename(f), basename(g)))
## Oh dear: pdftohtml can apparently fail without returning a non-zero
## exit status, so check for the output file instead.
if(file.exists(g))
.get_urls_from_HTML_file(g)
else
character()
}
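## Illustrative example (not run; assumes the 'pdftohtml' utility,
## typically shipped with poppler-utils, is on the PATH):
##   .get_urls_from_PDF_file("vignette.pdf")
## effectively runs
##   pdftohtml -s -q -i -c -xml vignette.pdf <tempfile>.xml
## in a temporary directory and then scans the XML for href attributes.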
url_db <-
function(urls, parents)
{
## Some people get leading LFs in URLs, so trim before checking.
db <- data.frame(URL = trimws(as.character(urls)),
Parent = as.character(parents),
stringsAsFactors = FALSE)
class(db) <- c("url_db", "data.frame")
db
}
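## Illustrative example (not run), with placeholder values:
##   url_db(c(" https://www.R-project.org\n", "ftp://example.org/pub"),
##          c("man/foo.Rd", "DESCRIPTION"))
## gives a two-row 'url_db' data frame with the surrounding whitespace
## already trimmed from the URLs.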
url_db_from_HTML_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
urls <- parents <- character()
if(is.null(files))
files <- list.files(dir, pattern = "[.]html$",
full.names = TRUE,
recursive = recursive)
urls <-
lapply(files,
function(f) {
if(verbose)
message(sprintf("processing %s",
.file_path_relative_to_dir(f, dir)))
.get_urls_from_HTML_file(f)
})
names(urls) <- files
urls <- Filter(length, urls)
if(length(urls)) {
parents <- rep.int(.file_path_relative_to_dir(names(urls), dir),
lengths(urls))
urls <- unlist(urls, use.names = FALSE)
}
url_db(urls, parents)
}
url_db_from_PDF_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
urls <- parents <- character()
if(is.null(files))
files <- list.files(dir, pattern = "[.]pdf$",
full.names = TRUE,
recursive = recursive)
## FIXME: this would be simpler with full.names = FALSE and without
## tools:::.file_path_relative_to_dir().
urls <-
lapply(files,
function(f) {
if(verbose)
message(sprintf("processing %s",
.file_path_relative_to_dir(f, dir)))
.get_urls_from_PDF_file(f)
})
names(urls) <- files
urls <- Filter(length, urls)
if(length(urls)) {
parents <- rep.int(.file_path_relative_to_dir(names(urls), dir),
lengths(urls))
urls <- unlist(urls, use.names = FALSE)
}
url_db(urls, parents)
}
url_db_from_package_Rd_db <-
function(db)
{
urls <- Filter(length, lapply(db, .get_urls_from_Rd))
url_db(unlist(urls, use.names = FALSE),
rep.int(file.path("man", names(urls)),
lengths(urls)))
}
url_db_from_package_metadata <-
function(meta)
{
urls <- character()
fields <- c("URL", "BugReports")
for(v in meta[fields]) {
if(is.na(v)) next
pattern <-
"<(URL: *)?((https?|ftp)://[^[:space:],]*)[[:space:]]>"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
regmatches(v, m) <- ""
pattern <- "(^|[^>\"])((https?|ftp)://[^[:space:],]*)"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
}
url_db(urls, rep.int("DESCRIPTION", length(urls)))
}
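## Illustrative example (not run): for a (placeholder) DESCRIPTION with
##   URL: https://example.org/pkg, <https://example.org/devel>
## the first pattern extracts the angle-bracketed URL and removes it,
## and the second pattern then picks up the remaining bare URL
## (stopping at whitespace or a comma), so both get Parent "DESCRIPTION".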
url_db_from_package_citation <-
function(dir, meta, installed = FALSE)
{
urls <- character()
path <- if(installed) "CITATION" else file.path("inst", "CITATION")
cfile <- file.path(dir, path)
if(file.exists(cfile)) {
cinfo <- .read_citation_quietly(cfile, meta)
if(!inherits(cinfo, "error"))
urls <- trimws(unique(unlist(cinfo$url, use.names = FALSE)))
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_news <-
function(dir, installed = FALSE)
{
path <- if(installed) "NEWS.Rd" else file.path("inst", "NEWS.Rd")
nfile <- file.path(dir, path)
urls <-
if(file.exists(nfile)) {
macros <- initialRdMacros()
.get_urls_from_Rd(prepare_Rd(parse_Rd(nfile, macros = macros),
stages = "install"))
} else character()
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_HTML_files <-
function(dir, installed = FALSE)
{
path <- if(installed) "doc" else file.path("inst", "doc")
files <- Sys.glob(file.path(dir, path, "*.html"))
if(installed && file.exists(rfile <- file.path(dir, "README.html")))
files <- c(files, rfile)
url_db_from_HTML_files(dir, files = files)
}
url_db_from_package_README_md <-
function(dir, installed = FALSE)
{
urls <- path <- character()
rfile <- Filter(file.exists,
c(if(!installed)
file.path(dir, "inst", "README.md"),
file.path(dir, "README.md")))[1L]
if(!is.na(rfile) && nzchar(Sys.which("pandoc"))) {
path <- .file_path_relative_to_dir(rfile, dir)
tfile <- tempfile("README", fileext = ".html")
on.exit(unlink(tfile))
out <- .pandoc_md_for_CRAN(rfile, tfile)
if(!out$status) {
urls <- .get_urls_from_HTML_file(tfile)
}
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_NEWS_md <-
function(dir, installed = FALSE)
{
urls <- path <- character()
nfile <- Filter(file.exists,
c(if(!installed)
file.path(dir, "inst", "NEWS.md"),
file.path(dir, "NEWS.md")))[1L]
if(!is.na(nfile) && nzchar(Sys.which("pandoc"))) {
path <- .file_path_relative_to_dir(nfile, dir)
tfile <- tempfile("NEWS", fileext = ".html")
on.exit(unlink(tfile))
out <- .pandoc_md_for_CRAN(nfile, tfile)
if(!out$status) {
urls <- .get_urls_from_HTML_file(tfile)
}
}
url_db(urls, rep.int(path, length(urls)))
}
url_db_from_package_sources <-
function(dir, add = FALSE) {
meta <- .read_description(file.path(dir, "DESCRIPTION"))
db <- rbind(url_db_from_package_metadata(meta),
url_db_from_package_Rd_db(Rd_db(dir = dir)),
url_db_from_package_citation(dir, meta),
url_db_from_package_news(dir))
if(requireNamespace("xml2", quietly = TRUE)) {
db <- rbind(db,
url_db_from_package_HTML_files(dir),
url_db_from_package_README_md(dir),
url_db_from_package_NEWS_md(dir)
)
}
if(add)
db$Parent <- file.path(basename(dir), db$Parent)
db
}
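## Illustrative usage (not run; these are internal helpers, so from
## outside the package they would need to be accessed via tools:::):
##   db <- url_db_from_package_sources("path/to/package/sources")
##   check_url_db(db)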
url_db_from_installed_packages <-
function(packages, lib.loc = NULL, verbose = FALSE)
{
if(!length(packages)) return()
one <- function(p) {
if(verbose)
message(sprintf("processing %s", p))
dir <- system.file(package = p, lib.loc = lib.loc)
if(dir == "") return()
meta <- .read_description(file.path(dir, "DESCRIPTION"))
rddb <- Rd_db(p, lib.loc = dirname(dir))
db <- rbind(url_db_from_package_metadata(meta),
url_db_from_package_Rd_db(rddb),
url_db_from_package_citation(dir, meta,
installed = TRUE),
url_db_from_package_news(dir, installed = TRUE))
if(requireNamespace("xml2", quietly = TRUE)) {
db <- rbind(db,
url_db_from_package_HTML_files(dir,
installed = TRUE),
url_db_from_package_README_md(dir,
installed = TRUE),
url_db_from_package_NEWS_md(dir,
installed = TRUE)
)
}
db$Parent <- file.path(p, db$Parent)
db
}
do.call(rbind,
c(lapply(packages, one),
list(make.row.names = FALSE)))
}
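## Illustrative usage (not run):
##   db <- url_db_from_installed_packages(c("tools", "utils"),
##                                        verbose = TRUE)
## gives one combined db whose Parent entries are prefixed with the
## package name, e.g. "<pkg>/DESCRIPTION" or "<pkg>/man/<file>.Rd".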
get_IANA_HTTP_status_code_db <-
function()
{
## See
## <http://www.iana.org/assignments/http-status-codes/http-status-codes.xhtml>
baseurl <- "http://www.iana.org/assignments/http-status-codes/"
db <- utils::read.csv(url(paste0(baseurl, "http-status-codes-1.csv")),
stringsAsFactors = FALSE)
## Drop "Unassigned".
db[db$Description != "Unassigned", ]
}
## See <https://en.wikipedia.org/wiki/List_of_FTP_server_return_codes>
## and <http://tools.ietf.org/html/rfc959>,
## Section 4.2.2 "Numeric Order List of Reply Codes",
## and <https://tools.ietf.org/html/rfc2228>,
## Section 5 "New FTP Replies".
## Only need those >= 400.
table_of_FTP_server_return_codes <-
c("421" = "Service not available, closing control connection.",
"425" = "Can't open data connection.",
"426" = "Connection closed; transfer aborted.",
"430" = "Invalid username or password",
"431" = "Need some unavailable resource to process security.",
"434" = "Requested host unavailable.",
"450" = "Requested file action not taken.",
"451" = "Requested action aborted: local error in processing.",
"452" = "Requested action not taken. Insufficient storage space in system.",
"500" = "Syntax error, command unrecognized.",
"501" = "Syntax error in parameters or arguments.",
"502" = "Command not implemented.",
"503" = "Bad sequence of commands.",
"504" = "Command not implemented for that parameter.",
"530" = "Not logged in.",
"532" = "Need account for storing files.",
"533" = "Command protection level denied for policy reasons.",
"534" = "Request denied for policy reasons.",
"535" = "Failed security check (hash, sequence, etc).",
"536" = "Requested PROT level not supported by mechanism.",
"537" = "Command protection level not supported by security mechanism.",
"550" = "Requested action not taken. File unavailable",
"551" = "Requested action aborted: page type unknown.",
"552" = "Requested file action aborted. Exceeded storage allocation (for current directory or dataset).",
"553" = "Requested action not taken. File name not allowed.",
"631" = "Integrity protected reply.",
"632" = "Confidentiality and integrity protected reply.",
"633" = "Confidentiality protected reply."
)
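## Used as a simple lookup table, e.g. (not run):
##   table_of_FTP_server_return_codes[["550"]]
##   ## [1] "Requested action not taken. File unavailable"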
check_url_db <-
function(db, remote = TRUE, verbose = FALSE)
{
use_curl <-
config_val_to_logical(Sys.getenv("_R_CHECK_URLS_USE_CURL_",
"TRUE")) &&
requireNamespace("curl", quietly = TRUE)
.gather <- function(u = character(),
p = list(),
s = rep.int("", length(u)),
m = rep.int("", length(u)),
new = rep.int("", length(u)),
cran = rep.int("", length(u)),
spaces = rep.int("", length(u)),
R = rep.int("", length(u))) {
y <- data.frame(URL = u, From = I(p), Status = s, Message = m,
New = new, CRAN = cran, Spaces = spaces, R = R,
row.names = NULL, stringsAsFactors = FALSE)
y$From <- p
class(y) <- c("check_url_db", "data.frame")
y
}
.fetch <- function(u) {
if(verbose) message(sprintf("processing %s", u))
h <- tryCatch(curlGetHeaders(u), error = identity)
if(inherits(h, "error")) {
## Currently, this info is only used in .check_http().
## Might be useful for checking ftps too, so simply leave it
## here instead of moving to .check_http().
msg <- conditionMessage(h)
if (grepl("libcurl error code (51|60)", msg)) {
h2 <- tryCatch(curlGetHeaders(u, verify = FALSE),
error = identity)
attr(h, "no-verify") <- h2
}
}
h
}
.check_ftp <- function(u) {
h <- .fetch(u)
if(inherits(h, "error")) {
s <- "-1"
msg <- sub("[[:space:]]*$", "", conditionMessage(h))
} else {
s <- as.character(attr(h, "status"))
msg <- table_of_FTP_server_return_codes[s]
}
c(s, msg, "", "")
}
.check_http <- if(remote)
function(u) c(.check_http_A(u), .check_http_B(u))
else
function(u) c(rep.int("", 3L), .check_http_B(u))
.check_http_A <- function(u) {
h <- .fetch(u)
newLoc <- ""
if(inherits(h, "error")) {
s <- "-1"
msg <- sub("[[:space:]]*$", "", conditionMessage(h))
if (!is.null(v <- attr(h, "no-verify"))) {
s2 <- as.character(attr(v, "status"))
msg <- paste0(msg, "\n\t(Status without verification: ",
table_of_HTTP_status_codes[s2], ")")
}
} else {
s <- as.character(attr(h, "status"))
msg <- table_of_HTTP_status_codes[s]
}
## Look for redirected URLs
if (any(grepl("301 Moved Permanently", h, useBytes = TRUE))) {
ind <- grep("^[Ll]ocation: ", h, useBytes = TRUE)
if (length(ind))
newLoc <- sub("^[Ll]ocation: ([^\r]*)\r\n", "\\1", h[max(ind)])
}
##
if((s != "200") && use_curl) {
g <- .curl_GET_status(u)
if(g == "200") {
s <- g
msg <- "OK"
}
}
## A mis-configured site
if (s == "503" && any(grepl("www.sciencedirect.com", c(u, newLoc))))
s <- "405"
c(s, msg, newLoc)
}
.check_http_B <- function(u) {
ul <- tolower(u)
cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) &&
!grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]]+(html|pdf|rds)$",
ul)) ||
(grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$",
ul)) ||
startsWith(ul, "http://cran.r-project.org") ||
any(startsWith(ul, mirrors)))
R <- grepl("^http://(www|bugs|journal).r-project.org", ul)
spaces <- grepl(" ", u)
c(if(cran) u else "", if(spaces) u else "", if(R) u else "")
}
bad <- .gather()
if(!NROW(db)) return(bad)
## Could also use utils::getCRANmirrors(local.only = TRUE).
mirrors <- c(utils::read.csv(file.path(R.home("doc"),
"CRAN_mirrors.csv"),
as.is = TRUE, encoding = "UTF-8")$URL,
"http://cran.rstudio.com/",
"https://cran.rstudio.com/")
mirrors <- tolower(sub("/$", "", mirrors))
if(inherits(db, "check_url_db")) {
## Allow re-checking check results.
parents <- db$From
urls <- db$URL
} else {
parents <- split(db$Parent, db$URL)
urls <- names(parents)
}
parts <- parse_URI_reference(urls)
## Empty URLs.
ind <- apply(parts == "", 1L, all)
if(any(ind)) {
len <- sum(ind)
bad <- rbind(bad,
.gather(urls[ind],
parents[ind],
m = rep.int("Empty URL", len)))
}
## Invalid URI schemes.
schemes <- parts[, 1L]
ind <- is.na(match(schemes,
c("",
IANA_URI_scheme_db$URI_Scheme,
## Also allow 'javascript' scheme, see
## <https://tools.ietf.org/html/draft-hoehrmann-javascript-scheme-03>
## (but apparently never registered with IANA).
"javascript")))
if(any(ind)) {
len <- sum(ind)
msg <- rep.int("Invalid URI scheme", len)
doi <- schemes[ind] == "doi"
if(any(doi))
msg[doi] <- paste(msg[doi], "(use \\doi for DOIs in Rd markup)")
bad <- rbind(bad,
.gather(urls[ind], parents[ind], m = msg))
}
## ftp.
pos <- which(schemes == "ftp")
if(length(pos) && remote) {
results <- do.call(rbind, lapply(urls[pos], .check_ftp))
status <- as.numeric(results[, 1L])
ind <- (status < 0L) | (status >= 400L)
if(any(ind)) {
pos <- pos[ind]
s <- as.character(status[ind])
s[s == "-1"] <- "Error"
m <- results[ind, 2L]
m[is.na(m)] <- ""
bad <- rbind(bad,
.gather(urls[pos], parents[pos], s, m))
}
}
## http/https.
pos <- which(schemes == "http" | schemes == "https")
if(length(pos)) {
results <- do.call(rbind, lapply(urls[pos], .check_http))
status <- as.numeric(results[, 1L])
## 405 means the server does not allow HEAD requests.
## Maybe also skip 500, 503 and 504, as these are likely to be
## temporary issues.
ind <- is.na(match(status, c(200L, 405L, NA))) |
nzchar(results[, 4L]) |
nzchar(results[, 5L]) |
nzchar(results[, 6L])
if(any(ind)) {
pos <- pos[ind]
s <- as.character(status[ind])
s[is.na(s)] <- ""
s[s == "-1"] <- "Error"
m <- results[ind, 2L]
m[is.na(m)] <- ""
bad <- rbind(bad,
.gather(urls[pos], parents[pos], s, m,
results[ind, 3L],
results[ind, 4L],
results[ind, 5L],
results[ind, 6L]))
}
}
bad
}
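## Illustrative usage (not run; remote checking needs internet access):
##   bad <- check_url_db(url_db_from_package_sources("path/to/pkg"),
##                       remote = TRUE, verbose = TRUE)
##   bad           # zero rows means no problematic URLs were found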
format.check_url_db <-
function(x, ...)
{
if(!NROW(x)) return(character())
u <- x$URL
new <- x$New
ind <- nzchar(new)
u[ind] <- sprintf("%s (moved to %s)", u[ind], new[ind])
paste0(sprintf("URL: %s", u),
sprintf("\nFrom: %s",
vapply(x$From, paste, "", collapse = "\n ")),
ifelse((s <- x$Status) == "",
"",
sprintf("\nStatus: %s", s)),
ifelse((m <- x$Message) == "",
"",
sprintf("\nMessage: %s", gsub("\n", "\n ", m))),
ifelse((m <- x$Spaces) == "",
"",
"\nURL contains spaces"),
ifelse((m <- x$CRAN) == "",
"",
"\nCRAN URL not in canonical form"),
ifelse((m <- x$R) == "",
"",
"\nR-project URL not in canonical form")
)
}
print.check_url_db <-
function(x, ...)
{
if(NROW(x))
writeLines(paste(format(x), collapse = "\n\n"))
invisible(x)
}
as.matrix.check_url_db <-
function(x, ...)
{
n <- lengths(x[["From"]])
y <- do.call(cbind,
c(list(URL = rep.int(x[["URL"]], n),
Parent = unlist(x[["From"]])),
lapply(x[-c(1L, 2L)], rep.int, n)))
rownames(y) <- NULL
y
}
.curl_GET_status <-
function(u, verbose = FALSE)
{
if(verbose)
message(sprintf("processing %s", u))
## Configure curl handle for better luck with JSTOR URLs/DOIs.
## Alternatively, special-case requests to
## https?://doi.org/10.2307
## https?://www.jstor.org
h <- curl::new_handle()
curl::handle_setopt(h,
cookiesession = 1,
followlocation = 1,
http_version = 2L,
ssl_enable_alpn = 0)
g <- tryCatch(curl::curl_fetch_memory(u, handle = h),
error = identity)
if(inherits(g, "error"))
-1L
else
g$status_code
}
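## Illustrative example (not run; requires the 'curl' package):
##   .curl_GET_status("https://www.r-project.org/")   # 200L on success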