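// newgtlds is a utility command that downloads the list of gTLDs from ICANN,
// formats it into the PSL format and splices it into the newGTLDs section of
// an existing public suffix list dat file. The resulting file content is
// written to stdout, or back over the dat file when -overwrite is given.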
package main
import (
"bytes"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"net/http"
"os"
"strings"
"text/template"
"time"
)
const (
// ICANN_GTLD_JSON_URL is the URL for the ICANN gTLD JSON registry (version
// 2). See https://www.icann.org/resources/pages/registries/registries-en for
// more information.
ICANN_GTLD_JSON_URL = "https://www.icann.org/resources/registries/gtlds/v2/gtlds.json"
// IANA_TLDS_TXT_URL is the URL for the IANA "Public Suffix List" of TLDs
// in the ICP-3 Root - including new ccTLDs, EBERO gTLDs or things not in
// the JSON file above that should be included in the PSL. Note: the TLDs in
// this text file are written in UPPERCASE.
IANA_TLDS_TXT_URL = "http://data.iana.org/TLD/tlds-alpha-by-domain.txt"
// PSL_GTLDS_SECTION_HEADER marks the start of the newGTLDs section of the
// overall public suffix dat file. It is compared against whole lines of the
// dat file, so it must match a line exactly.
PSL_GTLDS_SECTION_HEADER = "// newGTLDs"
// PSL_GTLDS_SECTION_FOOTER marks the end of the newGTLDs section of the
// overall public suffix dat file. Like the header it is matched against
// whole lines.
PSL_GTLDS_SECTION_FOOTER = "// ===END ICANN DOMAINS==="
)
var (
// legacyGTLDs are gTLDs that predate ICANN's new gTLD program. These legacy
// gTLDs are present in the ICANN_GTLD_JSON_URL data but we do not want to
// include them in the new gTLD section of the PSL data because it will create
// duplicates with existing entries alongside registry-reserved second level
// domains present in the PSL data. Entries present in legacyGTLDs will not be
// output by this tool when generating the new gTLD data.
//
// The map is used as a set keyed by the ASCII label: filterGTLDs only checks
// whether a key is present, not the boolean stored for it.
legacyGTLDs = map[string]bool{
"aero": true,
"asia": true,
"biz": true,
"cat": true,
"com": true,
"coop": true,
"info": true,
"jobs": true,
"mobi": true,
"museum": true,
"name": true,
"net": true,
"org": true,
"post": true,
"pro": true,
"tel": true,
"xxx": true,
}
// pslHeaderTemplate is a parsed text/template instance for rendering the header
// before the data rendered with the pslTemplate. We use two separate templates
// so that we can avoid having a variable date stamp in the pslTemplate, allowing
// us to easily check that the data in the current .dat file is unchanged from
// what we render when there are no updates to add.
//
// The template text begins with a newline, so the rendered header starts with
// an empty line followed by two comment lines.
//
// Expected template data:
// URL - the string URL that the data was fetched from.
// Date - the time.Time at which the data was fetched.
// DateFormat - the layout string passed to Date.Format.
pslHeaderTemplate = template.Must(template.New("public-suffix-list-gtlds-header").Parse(`
// List of new gTLDs imported from {{ .URL }} on {{ .Date.Format .DateFormat }}
// This list is auto-generated, don't edit it manually.`))
// pslTemplate is a parsed text/template instance for rendering a list of pslEntry
// objects in the format used by the public suffix list. For every entry it
// emits the result of the entry's Comment method followed by the entry's
// ULabel.
//
// It expects the following template data:
// Entries - a list of pslEntry objects.
pslTemplate = template.Must(
template.New("public-suffix-list-gtlds").Parse(`
{{- range .Entries }}
{{- .Comment }}
{{ printf "%s\n" .ULabel }}
{{ end }}`))
)
// pslEntry is a struct matching a subset of the gTLD data fields present in
// each object entry of the "GTLDs" array from ICANN_GTLD_JSON_URL. Only ALabel
// carries an explicit json tag; the remaining fields are decoded using their
// Go field names.
type pslEntry struct {
// ALabel contains the ASCII gTLD name. For internationalized gTLDs the GTLD
// field is expressed in punycode.
ALabel string `json:"gTLD"`
// ULabel contains the unicode representation of the gTLD name. When the gTLD
// ULabel in the ICANN gTLD data is empty (e.g. for an ASCII gTLD like
// '.pizza') normalize copies the ALabel into the ULabel.
ULabel string
// RegistryOperator holds the name of the registry operator that operates the
// gTLD (may be empty).
RegistryOperator string
// DateOfContractSignature holds the date the gTLD contract was signed (may be empty).
DateOfContractSignature string
// ContractTerminated indicates whether the contract has been terminated by
// ICANN. Entries with ContractTerminated = true are dropped by filterGTLDs
// and so never reach the pslTemplate.
ContractTerminated bool
// RemovalDate indicates the date the gTLD delegation was removed from the
// root zones. Entries with a non-empty RemovalDate are dropped by
// filterGTLDs.
RemovalDate string
}
// normalize will normalize a pslEntry by mutating it in place to trim every
// string field of leading/trailing whitespace and by populating the ULabel
// with the ALabel if the ULabel is empty.
//
// RemovalDate is trimmed along with the other fields so that a value made up
// only of whitespace is treated as "no removal date" by later emptiness
// checks instead of causing the entry to be dropped.
func (e *pslEntry) normalize() {
	e.ALabel = strings.TrimSpace(e.ALabel)
	e.ULabel = strings.TrimSpace(e.ULabel)
	e.RegistryOperator = strings.TrimSpace(e.RegistryOperator)
	e.DateOfContractSignature = strings.TrimSpace(e.DateOfContractSignature)
	e.RemovalDate = strings.TrimSpace(e.RemovalDate)

	// If there is no explicit uLabel use the gTLD as the uLabel. This runs
	// after trimming so a whitespace-only uLabel also falls back.
	if e.ULabel == "" {
		e.ULabel = e.ALabel
	}
}
// Comment builds the `//`-prefixed comment line that precedes the entry in the
// rendered PSL data. Two shapes are produced.
//
// Without a registry operator:
//
//	// <ALabel> : <DateOfContractSignature>
//
// With a registry operator:
//
//	// <ALabel> : <DateOfContractSignature> <RegistryOperator>
//
// The <DateOfContractSignature> part may be empty in either shape; the
// separating spaces around it are still emitted.
func (e pslEntry) Comment() string {
	var b strings.Builder
	b.WriteString("// ")
	b.WriteString(e.ALabel)
	b.WriteString(" : ")
	b.WriteString(e.DateOfContractSignature)
	// Only add the separator when there is an operator to follow it, so an
	// empty operator does not leave an extra trailing space.
	if e.RegistryOperator != "" {
		b.WriteString(" ")
		b.WriteString(e.RegistryOperator)
	}
	return b.String()
}
// gTLDDatSpan represents the span between the PSL_GTLDS_SECTION_HEADER and
// the PSL_GTLDS_SECTION_FOOTER in the PSL dat file. Both fields are indexes
// into the dat file's slice of lines.
type gTLDDatSpan struct {
// startIndex is the index of the first line of the span. readDatFileContent
// sets it to the line immediately after the header line.
startIndex int
// endIndex is the index one past the last line of the span.
// readDatFileContent sets it to the index of the footer line.
endIndex int
}
// Sentinel errors describing problems locating the gTLD section markers in a
// dat file.
var (
// errNoHeader is returned when the PSL_GTLDS_SECTION_HEADER line was not found.
errNoHeader = fmt.Errorf("did not find expected header line %q",
PSL_GTLDS_SECTION_HEADER)
// errMultipleHeaders is returned when the PSL_GTLDS_SECTION_HEADER line is
// seen a second time while scanning the dat file.
errMultipleHeaders = fmt.Errorf("found expected header line %q more than once",
PSL_GTLDS_SECTION_HEADER)
// errNoFooter is returned when the PSL_GTLDS_SECTION_FOOTER line was not found.
errNoFooter = fmt.Errorf("did not find expected footer line %q",
PSL_GTLDS_SECTION_FOOTER)
)
// errInvertedSpan is the error reported when a gTLDDatSpan ends at or before
// the point where it starts. It carries the offending span so the message can
// include both indexes.
type errInvertedSpan struct {
	span gTLDDatSpan
}

// Error implements the error interface. The message names both marker lines
// and reports the span's end index followed by its start index.
func (e errInvertedSpan) Error() string {
	const format = "found footer line %q before header line %q (index %d vs %d)"
	footerAt, headerAt := e.span.endIndex, e.span.startIndex
	return fmt.Sprintf(format,
		PSL_GTLDS_SECTION_FOOTER, PSL_GTLDS_SECTION_HEADER,
		footerAt, headerAt)
}
// validate reports whether the span's indexes are sensible. The checks run in
// order and the first failure wins:
//
//   - errNoHeader when startIndex is not greater than zero,
//   - errNoFooter when endIndex is not greater than zero,
//   - errInvertedSpan when endIndex is less than or equal to startIndex.
//
// A nil error means the span has a positive start and a strictly larger end.
func (s gTLDDatSpan) validate() error {
	switch {
	case s.startIndex <= 0:
		return errNoHeader
	case s.endIndex <= 0:
		return errNoFooter
	case s.endIndex <= s.startIndex:
		return errInvertedSpan{span: s}
	default:
		return nil
	}
}
// datFile holds the individual lines read from the public suffix list dat file and
// the span that holds the gTLD specific data section. It supports reading the
// gTLD specific data, and replacing it.
type datFile struct {
// lines holds the datfile contents split by "\n"
lines []string
// gTLDSpan holds the indexes where the gTLD data can be found in lines. The
// gTLD data is lines[gTLDSpan.startIndex:gTLDSpan.endIndex].
gTLDSpan gTLDDatSpan
}
// errSpanOutOfBounds is the error reported when a gTLD span does not fit
// inside the dat file's lines. It records the span and the number of lines it
// was checked against.
type errSpanOutOfBounds struct {
	span     gTLDDatSpan
	numLines int
}

// Error implements the error interface, reporting the span's start index, end
// index and the number of lines in the file.
func (e errSpanOutOfBounds) Error() string {
	start, end := e.span.startIndex, e.span.endIndex
	return fmt.Sprintf(
		"span out of bounds: start index %d, end index %d, number of lines %d",
		start, end, e.numLines)
}
// validate checks that the datFile's gTLD span can be used to slice its lines.
// Any error from the span's own validate() is returned unchanged. Otherwise an
// errSpanOutOfBounds is returned when the span's endIndex is greater than or
// equal to the number of lines in the file.
func (d datFile) validate() error {
	span, numLines := d.gTLDSpan, len(d.lines)
	if err := span.validate(); err != nil {
		return err
	}
	if span.endIndex < numLines {
		return nil
	}
	return errSpanOutOfBounds{span: span, numLines: numLines}
}
// getGTLDLines returns the slice of dat file lines covered by the gTLD span,
// from startIndex up to but not including endIndex. The datFile is validated
// first and any validation error is returned with a nil slice.
func (d datFile) getGTLDLines() ([]string, error) {
	err := d.validate()
	if err != nil {
		return nil, err
	}
	from, to := d.gTLDSpan.startIndex, d.gTLDSpan.endIndex
	return d.lines[from:to], nil
}
// ReplaceGTLDContent updates the dat file's lines to replace the gTLD data span
// with new content. The content is split on "\n" and spliced in between the
// lines before the span and the lines from the span's end onwards. The span's
// endIndex is updated to account for the new content length.
//
// The datFile is validated first and the validation error, if any, is
// returned without modifying anything.
//
// The replacement lines are assembled in a freshly allocated slice. Appending
// directly onto d.lines[:startIndex] would reuse the old backing array when
// the new content is not longer than the old, silently overwriting lines that
// earlier callers may still hold (for example a slice from getGTLDLines).
func (d *datFile) ReplaceGTLDContent(content string) error {
	if err := d.validate(); err != nil {
		return err
	}

	contentLines := strings.Split(content, "\n")
	beforeLines := d.lines[:d.gTLDSpan.startIndex]
	afterLines := d.lines[d.gTLDSpan.endIndex:]

	newLines := make([]string, 0, len(beforeLines)+len(contentLines)+len(afterLines))
	newLines = append(newLines, beforeLines...)
	newLines = append(newLines, contentLines...)
	newLines = append(newLines, afterLines...)

	// Update the span based on the new content length
	d.gTLDSpan.endIndex = len(beforeLines) + len(contentLines)
	// and update the data file lines
	d.lines = newLines
	return nil
}
// String reassembles the dat file content by joining its lines with the same
// "\n" separator the content was split on.
func (d datFile) String() string {
	const lineSep = "\n"
	return strings.Join(d.lines, lineSep)
}
// readDatFile loads the PSL dat file at datFilePath and parses it with
// readDatFileContent, yielding the file's lines together with the span where
// the gTLD data lives. The error from reading the file is returned as-is;
// otherwise the result (including any error) of readDatFileContent is
// returned.
func readDatFile(datFilePath string) (*datFile, error) {
	raw, readErr := os.ReadFile(datFilePath)
	if readErr != nil {
		return nil, readErr
	}
	content := string(raw)
	return readDatFileContent(content)
}
// readDatFileContent splits pslData into lines and locates the gTLD section
// delimited by the PSL_GTLDS_SECTION_HEADER and PSL_GTLDS_SECTION_FOOTER
// lines. The returned datFile holds every line and a span that starts on the
// line after the header and ends at the footer line.
//
// Scanning stops as soon as both a header and a footer have been seen. The
// first footer line encountered is the one that is used.
//
// Errors:
//   - errMultipleHeaders if a second header line is seen while scanning,
//   - errNoHeader / errNoFooter if the corresponding marker was never seen,
//   - errInvertedSpan if the footer comes before the header,
//   - whatever the datFile's validate() reports for the resulting span.
func readDatFileContent(pslData string) (*datFile, error) {
	// notFound is deliberately not a valid line index. Using 0 as the "not
	// seen yet" value would make a marker on the very first line invisible.
	const notFound = -1

	lines := strings.Split(pslData, "\n")
	headerIndex, footerIndex := notFound, notFound
	for i, line := range lines {
		switch line {
		case PSL_GTLDS_SECTION_HEADER:
			// Seeing the header twice is unexpected.
			if headerIndex != notFound {
				return nil, errMultipleHeaders
			}
			headerIndex = i
		case PSL_GTLDS_SECTION_FOOTER:
			// Only the first footer is recorded.
			if footerIndex == notFound {
				footerIndex = i
			}
		}
		// Stop when we have found one header and one footer.
		if headerIndex != notFound && footerIndex != notFound {
			break
		}
	}

	if headerIndex == notFound {
		return nil, errNoHeader
	}
	if footerIndex == notFound {
		return nil, errNoFooter
	}

	span := gTLDDatSpan{
		startIndex: headerIndex + 1,
		endIndex:   footerIndex,
	}
	// Report a footer that precedes the header explicitly. validate() alone
	// would describe a footer on line 0 as a missing footer.
	if footerIndex < headerIndex {
		return nil, errInvertedSpan{span: span}
	}

	parsed := &datFile{
		lines:    lines,
		gTLDSpan: span,
	}
	if err := parsed.validate(); err != nil {
		return nil, err
	}
	return parsed, nil
}
// getData performs a HTTP GET request to the given URL and returns the
// response body bytes or returns an error. An HTTP response code other than
// http.StatusOK (200) is considered to be an error.
//
// The request is made with a client that has an overall timeout covering
// connecting, following redirects and reading the body, so an unresponsive
// server results in an error rather than the tool hanging indefinitely.
func getData(url string) ([]byte, error) {
	// Generous upper bound for fetching a registry listing in one request.
	const timeout = 2 * time.Minute

	// A zero Transport means the shared http.DefaultTransport is used; only
	// the timeout differs from http.Get.
	client := &http.Client{Timeout: timeout}
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code fetching data "+
			"from %q : expected status %d got %d",
			url, http.StatusOK, resp.StatusCode)
	}

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading response body from %q: %w", url, err)
	}
	return respBody, nil
}
// filterGTLDs returns the entries that should appear in the new gTLD section,
// preserving their order. An entry is dropped when any of the following holds:
// its ALabel is a key of the legacyGTLDs map, its ContractTerminated flag is
// set, or its RemovalDate is non-empty. The result is nil when nothing is
// kept.
func filterGTLDs(entries []*pslEntry) []*pslEntry {
	var kept []*pslEntry
	for _, candidate := range entries {
		// Only the presence of the key matters, not the stored bool.
		_, legacy := legacyGTLDs[candidate.ALabel]
		if legacy || candidate.ContractTerminated || candidate.RemovalDate != "" {
			continue
		}
		kept = append(kept, candidate)
	}
	return kept
}
// getPSLEntries fetches a list of pslEntry objects (or returns an error) by:
//  1. getting the raw JSON data from the provided url string.
//  2. unmarshaling the JSON data to create pslEntry objects.
//  3. normalizing the pslEntry objects.
//  4. filtering the entries with filterGTLDs.
//
// If there are no pslEntry objects after unmarshaling the data in step 2 or
// filtering the gTLDs in step 4 it is considered an error condition.
//
// A JSON decoding failure is wrapped with %w, so the underlying error from
// encoding/json stays reachable through errors.Is / errors.As.
func getPSLEntries(url string) ([]*pslEntry, error) {
	respBody, err := getData(url)
	if err != nil {
		return nil, err
	}

	var results struct {
		GTLDs []*pslEntry
	}
	if err := json.Unmarshal(respBody, &results); err != nil {
		return nil, fmt.Errorf(
			"unmarshaling ICANN gTLD JSON data: %w", err)
	}

	// We expect there to always be GTLD data. If there was none after unmarshaling
	// then it's likely the data format has changed or something else has gone wrong.
	if len(results.GTLDs) == 0 {
		return nil, errors.New("found no gTLD information after unmarshaling")
	}

	// Normalize each tldEntry. This will remove leading/trailing whitespace and
	// populate the ULabel with the ALabel if the entry has no ULabel.
	for _, tldEntry := range results.GTLDs {
		tldEntry.normalize()
	}

	filtered := filterGTLDs(results.GTLDs)
	if len(filtered) == 0 {
		return nil, errors.New(
			"found no gTLD information after removing legacy and contract terminated gTLDs")
	}
	return filtered, nil
}
// renderTemplate renders the given template to the provided writer, using the
// templateData, or returns an error.
func renderTemplate(writer io.Writer, template *template.Template, templateData interface{}) error {
var buf bytes.Buffer
if err := template.Execute(&buf, templateData); err != nil {
return err
}
_, err := writer.Write(buf.Bytes())
if err != nil {
return err
}
return nil
}
// clock is a small interface that lets us mock time in unit tests. Code that
// needs the current time asks a clock for it instead of calling time.Now()
// directly.
type clock interface {
// Now returns the time the clock considers to be the current time.
Now() time.Time
}
// realClock is the clock implementation backed by the system clock.
type realClock struct{}

// Now reports the current time as given by time.Now().
func (realClock) Now() time.Time {
	return time.Now()
}
// renderHeader writes the rendered pslHeaderTemplate to writer, or returns the
// rendering error. The header records ICANN_GTLD_JSON_URL as the data source
// and the current UTC time, formatted as RFC 3339, as the import date. The
// time comes from clk; a nil clk falls back to realClock.
func renderHeader(writer io.Writer, clk clock) error {
	if clk == nil {
		clk = &realClock{}
	}

	// Field names must match the ones referenced by pslHeaderTemplate.
	type headerData struct {
		URL        string
		Date       time.Time
		DateFormat string
	}
	return renderTemplate(writer, pslHeaderTemplate, headerData{
		URL:        ICANN_GTLD_JSON_URL,
		Date:       clk.Now().UTC(),
		DateFormat: time.RFC3339,
	})
}
// renderData writes the given pslEntry objects, rendered with the pslTemplate,
// to writer. Any rendering or write error is returned.
func renderData(writer io.Writer, entries []*pslEntry) error {
	// The field name must match the one ranged over by pslTemplate.
	type listData struct {
		Entries []*pslEntry
	}
	return renderTemplate(writer, pslTemplate, listData{Entries: entries})
}
// process brings the gTLD section of datFile up to date with the data served
// at dataURL and returns the resulting file content.
//
// The existing gTLD span consists of a dated header followed by the entry
// data. Only the entry data is compared with the freshly rendered entries; the
// header is skipped by counting the lines of a newly rendered header. When the
// data is unchanged the dat file content is returned as it was read. Otherwise
// the span is replaced with the new header and data before the content is
// returned.
//
// clk supplies the time used in the header and may be nil, in which case
// renderHeader falls back to the real clock. Errors from reading the span,
// rendering, fetching entries or replacing the span are returned with an empty
// string.
func process(datFile *datFile, dataURL string, clk clock) (string, error) {
	// The span covers the dated header as well as the gTLD entries.
	currentSpan, err := datFile.getGTLDLines()
	if err != nil {
		return "", err
	}

	// A fresh header tells us how many lines the dynamic header occupies.
	var header strings.Builder
	if err := renderHeader(&header, clk); err != nil {
		return "", err
	}
	headerLineCount := len(strings.Split(header.String(), "\n"))

	// The existing span must hold more than just a header.
	if len(currentSpan) <= headerLineCount {
		return "", errors.New("gtld span data was too small, missing header?")
	}
	currentData := strings.Join(currentSpan[headerLineCount:], "\n")

	// Fetch and render the current set of entries.
	entries, err := getPSLEntries(dataURL)
	if err != nil {
		return "", err
	}
	var data strings.Builder
	if err := renderData(&data, entries); err != nil {
		return "", err
	}

	// Only touch the dat file when the entry data actually differs, so an
	// unchanged list does not produce a new date stamp.
	if rendered := data.String(); rendered != currentData {
		replacement := header.String() + "\n" + rendered
		if err := datFile.ReplaceGTLDContent(replacement); err != nil {
			return "", err
		}
	}
	return datFile.String(), nil
}
// main parses the command line flags, reads the dat file named by
// -psl-dat-file, updates its gTLD section from ICANN_GTLD_JSON_URL and then
// either prints the resulting content to stdout or, with -overwrite, writes it
// back over the dat file. Any error is reported on stderr and the process
// exits with status 1.
func main() {
	exitOnErr := func(err error) {
		if err == nil {
			return
		}
		fmt.Fprintf(os.Stderr, "error updating gTLD data: %v\n", err)
		os.Exit(1)
	}

	datPath := flag.String(
		"psl-dat-file",
		"public_suffix_list.dat",
		"file path to the public_suffix_list.dat data file to be updated with new gTLDs")
	inPlace := flag.Bool(
		"overwrite",
		false,
		"overwrite -psl-dat-file with the new data instead of printing to stdout")
	flag.Parse()

	// Load the existing file and locate the span holding the gTLD data.
	parsed, err := readDatFile(*datPath)
	exitOnErr(err)

	// A nil clock makes process use the real system time for the header.
	updated, err := process(parsed, ICANN_GTLD_JSON_URL, nil)
	exitOnErr(err)

	// Without -overwrite the content goes to stdout and we are done.
	if !*inPlace {
		fmt.Println(updated)
		os.Exit(0)
	}

	// Otherwise print nothing to stdout and write the content over the
	// existing dat file path we read earlier.
	exitOnErr(os.WriteFile(*datPath, []byte(updated), 0644))
}