blob: 47c95be0cdbaae91da3369095d52005b1c68cc87 [file] [log] [blame]
// newgtlds is a utility command that downloads the list of gTLDs from ICANN
// and formats it into the PSL format, writing to stdout.
package main
import (
const (
// ICANN_GTLD_JSON_URL is the URL for the ICANN gTLD JSON registry (version
// 2).
// NOTE(review): the reference link that followed "See ... for more
// information" and the declaration of this constant are not present in this
// copy of the file - restore both from the upstream source.
// IANA_TLDS_TXT_URL is the URL for the IANA "Public Suffix List" of TLDs
// in the ICP-3 Root - including new ccTLDs, EBRERO gTLDS or things not in
// the JSON File above that should be included in the PSL. Note: UPPERCASE
// (presumably the entries in that list are upper-cased - TODO confirm).
// NOTE(review): the declaration of this constant is also missing here.
// PSL_GTLDS_SECTION_HEADER marks the start of the newGTLDs section of the
// overall public suffix dat file. It is compared against whole lines of the
// dat file by readDatFileContent.
// PSL_GTLDS_SECTION_FOOTER marks the end of the newGTLDs section of the
// overall public suffix dat file. It is compared against whole lines of the
// dat file by readDatFileContent.
// NOTE(review): the values of both section markers and the closing
// parenthesis of this const block are missing from this copy.
var (
	// legacyGTLDs are gTLDs that predate ICANN's new gTLD program. These legacy
	// gTLDs are present in the ICANN_GTLD_JSON_URL data but we do not want to
	// include them in the new gTLD section of the PSL data because it will create
	// duplicates with existing entries alongside registry-reserved second level
	// domains present in the PSL data. Entries present in legacyGTLDs will not be
	// output by this tool when generating the new gTLD data.
	legacyGTLDs = map[string]bool{
		"aero":   true,
		"asia":   true,
		"biz":    true,
		"cat":    true,
		"com":    true,
		"coop":   true,
		"info":   true,
		"jobs":   true,
		"mobi":   true,
		"museum": true,
		"name":   true,
		"net":    true,
		"org":    true,
		"post":   true,
		"pro":    true,
		"tel":    true,
		"xxx":    true,
	}

	// pslHeaderTemplate is a parsed text/template instance for rendering the header
	// before the data rendered with the pslTemplate. We use two separate templates
	// so that we can avoid having a variable date stamp in the pslTemplate, allowing
	// us to easily check that the data in the current .dat file is unchanged from
	// what we render when there are no updates to add.
	//
	// Expected template data:
	//   URL - the string URL that the data was fetched from.
	//   Date - the time.Time that the data was fetched.
	//   DateFormat - the format string to use with the date.
	//
	// The rendered header starts with a newline (the raw string below begins
	// with one) and has no trailing newline.
	pslHeaderTemplate = template.Must(template.New("public-suffix-list-gtlds-header").Parse(`
// List of new gTLDs imported from {{ .URL }} on {{ .Date.Format .DateFormat }}
// This list is auto-generated, don't edit it manually.`))

	// pslTemplate is a parsed text/template instance for rendering a list of pslEntry
	// objects in the format used by the public suffix list.
	//
	// It expects the following template data:
	//   Entries - a list of pslEntry objects.
	//
	// Each entry renders as its Comment() line, its ULabel line and one blank
	// separator line.
	//
	// NOTE(review): the template name was chosen to parallel the header
	// template's name; it only appears in template error messages - confirm
	// against the upstream source.
	pslTemplate = template.Must(template.New("public-suffix-list-gtlds").Parse(`
{{- range .Entries }}
{{- .Comment }}
{{ printf "%s\n" .ULabel }}
{{ end }}`))
)
// pslEntry is a struct matching a subset of the gTLD data fields present in
// each object entry of the "GTLDs" array from ICANN_GTLD_JSON_URL.
type pslEntry struct {
	// ALabel contains the ASCII gTLD name. For internationalized gTLDs the GTLD
	// field is expressed in punycode.
	ALabel string `json:"gTLD"`
	// ULabel contains the unicode representation of the gTLD name. When the gTLD
	// ULabel in the ICANN gTLD data is empty (e.g for an ASCII gTLD like
	// '.pizza') the PSL entry will use the ALabel as the ULabel.
	ULabel string
	// RegistryOperator holds the name of the registry operator that operates the
	// gTLD (may be empty).
	RegistryOperator string
	// DateOfContractSignature holds the date the gTLD contract was signed (may be empty).
	DateOfContractSignature string
	// ContractTerminated indicates whether the contract has been terminated by
	// ICANN. When rendered by the pslTemplate only entries with
	// ContractTerminated = false are included.
	ContractTerminated bool
	// RemovalDate indicates the date the gTLD delegation was removed from the
	// root zones (may be empty).
	RemovalDate string
}

// normalize will normalize a pslEntry by mutating it in place to trim the
// string fields of whitespace and by populating the ULabel with the ALabel if
// the ULabel is empty.
func (e *pslEntry) normalize() {
	e.ALabel = strings.TrimSpace(e.ALabel)
	e.ULabel = strings.TrimSpace(e.ULabel)
	e.RegistryOperator = strings.TrimSpace(e.RegistryOperator)
	e.DateOfContractSignature = strings.TrimSpace(e.DateOfContractSignature)
	// RemovalDate is compared against "" when deciding whether a gTLD has been
	// removed, so a whitespace-only value must collapse to empty like every
	// other string field.
	e.RemovalDate = strings.TrimSpace(e.RemovalDate)

	// If there is no explicit uLabel use the gTLD as the uLabel.
	if e.ULabel == "" {
		e.ULabel = e.ALabel
	}
}

// Comment generates a comment string for the pslEntry. This string has a `//`
// prefix and matches one of the following two forms.
//
// If the registry operator field is empty the comment will be of the form:
//
//	'// <ALabel> : <DateOfContractSignature>'
//
// If the registry operator field is not empty the comment will be of the form:
//
//	'// <ALabel> : <DateOfContractSignature> <RegistryOperator>'
//
// In both cases the <DateOfContractSignature> may be empty, in which case the
// separator space that precedes it is still emitted.
func (e pslEntry) Comment() string {
	parts := []string{
		"//",
		e.ALabel,
		":",
		e.DateOfContractSignature,
	}
	// Avoid two trailing spaces if registry operator is empty
	if e.RegistryOperator != "" {
		parts = append(parts, e.RegistryOperator)
	}
	return strings.Join(parts, " ")
}
// gTLDDatSpan represents the span between the PSL_GTLD_SECTION_HEADER and
// the PSL_GTLDS_SECTION_FOOTER in the PSL dat file.
type gTLDDatSpan struct {
startIndex int
endIndex int
var (
errNoHeader = fmt.Errorf("did not find expected header line %q",
errMultipleHeaders = fmt.Errorf("found expected header line %q more than once",
errNoFooter = fmt.Errorf("did not find expected footer line %q",
type errInvertedSpan struct {
span gTLDDatSpan
func (e errInvertedSpan) Error() string {
return fmt.Sprintf(
"found footer line %q before header line %q (index %d vs %d)",
e.span.endIndex, e.span.startIndex)
// validate checks that a given gTLDDatSpan is sensible. It returns an err if
// the span is nil, if the start or end index haven't been set to > 0, or if the
// end index is <= the the start index.
func (s gTLDDatSpan) validate() error {
if s.startIndex <= 0 {
return errNoHeader
if s.endIndex <= 0 {
return errNoFooter
if s.endIndex <= s.startIndex {
return errInvertedSpan{span: s}
return nil
// datFile holds the individual lines read from the public suffix list dat file and
// the span that holds the gTLD specific data section. It supports reading the
// gTLD specific data, and replacing it.
type datFile struct {
// lines holds the datfile contents split by "\n"
lines []string
// gTLDSpan holds the indexes where the gTLD data can be found in lines.
gTLDSpan gTLDDatSpan
type errSpanOutOfBounds struct {
span gTLDDatSpan
numLines int
func (e errSpanOutOfBounds) Error() string {
return fmt.Sprintf(
"span out of bounds: start index %d, end index %d, number of lines %d",
e.span.startIndex, e.span.endIndex, e.numLines)
// validate validates the state of the datFile. It returns an error if
// the gTLD span validate() returns an error, or if gTLD span endIndex is >= the
// number of lines in the file.
func (d datFile) validate() error {
if err := d.gTLDSpan.validate(); err != nil {
return err
if d.gTLDSpan.endIndex >= len(d.lines) {
return errSpanOutOfBounds{span: d.gTLDSpan, numLines: len(d.lines)}
return nil
// getGTLDLines returns the lines from the dat file within the gTLD data span,
// or an error if the span isn't valid for the dat file.
func (d datFile) getGTLDLines() ([]string, error) {
if err := d.validate(); err != nil {
return nil, err
return d.lines[d.gTLDSpan.startIndex:d.gTLDSpan.endIndex], nil
// ReplaceGTLDContent updates the dat file's lines to replace the gTLD data span
// with new content.
func (d *datFile) ReplaceGTLDContent(content string) error {
if err := d.validate(); err != nil {
return err
contentLines := strings.Split(content, "\n")
beforeLines := d.lines[0:d.gTLDSpan.startIndex]
afterLines := d.lines[d.gTLDSpan.endIndex:]
newLines := append(beforeLines, append(contentLines, afterLines...)...)
// Update the span based on the new content length
d.gTLDSpan.endIndex = len(beforeLines) + len(contentLines)
// and update the data file lines
d.lines = newLines
return nil
// String returns the dat file's lines joined together.
func (d datFile) String() string {
return strings.Join(d.lines, "\n")
// readDatFile reads the contents of the PSL dat file from the provided path
// and returns a representation holding all of the lines and the span where the gTLD
// data is found within the dat file. An error is returned if the file can't be read
// or if the gTLD data span can't be found or is invalid.
func readDatFile(datFilePath string) (*datFile, error) {
pslDatBytes, err := os.ReadFile(datFilePath)
if err != nil {
return nil, err
return readDatFileContent(string(pslDatBytes))
func readDatFileContent(pslData string) (*datFile, error) {
pslDatLines := strings.Split(pslData, "\n")
headerIndex, footerIndex := 0, 0
for i := 0; i < len(pslDatLines); i++ {
line := pslDatLines[i]
if line == PSL_GTLDS_SECTION_HEADER && headerIndex == 0 {
// If the line matches the header and we haven't seen the header yet, capture
// the index
headerIndex = i
} else if line == PSL_GTLDS_SECTION_HEADER && headerIndex != 0 {
// If the line matches the header and we've already seen the header return
// an error. This is unexpected.
return nil, errMultipleHeaders
} else if line == PSL_GTLDS_SECTION_FOOTER && footerIndex == 0 {
// If the line matches the footer, capture the index. We don't need
// to consider the case where we've already seen a footer because we break
// below when we have both a header and footer index.
footerIndex = i
// Break when we have found one header and one footer.
if headerIndex != 0 && footerIndex != 0 {
if headerIndex == 0 {
return nil, errNoHeader
} else if footerIndex == 0 {
return nil, errNoFooter
datFile := &datFile{
lines: pslDatLines,
gTLDSpan: gTLDDatSpan{
startIndex: headerIndex + 1,
endIndex: footerIndex,
if err := datFile.validate(); err != nil {
return nil, err
return datFile, nil
// getData performs a HTTP GET request to the given URL and returns the
// response body bytes or returns an error. An HTTP response code other than
// http.StatusOK (200) is considered to be an error. The whole exchange
// (connecting, any redirects and reading the body) is bounded by a timeout so
// that a stalled server cannot hang the tool indefinitely.
func getData(url string) ([]byte, error) {
	// The package-level http.Get uses a client without any timeout.
	const requestTimeout = 60 * time.Second
	client := &http.Client{Timeout: requestTimeout}

	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code fetching data "+
			"from %q : expected status %d got %d",
			url, http.StatusOK, resp.StatusCode)
	}

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return respBody, nil
}
// filterGTLDs removes entries that are present in the legacyGTLDs map or have
// ContractTerminated equal to true, or a non-empty RemovalDate.
func filterGTLDs(entries []*pslEntry) []*pslEntry {
var filtered []*pslEntry
for _, entry := range entries {
if _, isLegacy := legacyGTLDs[entry.ALabel]; isLegacy {
if entry.ContractTerminated {
if entry.RemovalDate != "" {
filtered = append(filtered, entry)
return filtered
// getPSLEntries fetches a list of pslEntry objects (or returns an error) by:
// 1. getting the raw JSON data from the provided url string.
// 2. unmarshaling the JSON data to create pslEntry objects.
// 3. normalizing the pslEntry objects.
// 4. filtering out any legacy or contract terminated gTLDs
// If there are no pslEntry objects after unmarshaling the data in step 2 or
// filtering the gTLDs in step 4 it is considered an error condition.
func getPSLEntries(url string) ([]*pslEntry, error) {
respBody, err := getData(url)
if err != nil {
return nil, err
var results struct {
GTLDs []*pslEntry
if err := json.Unmarshal(respBody, &results); err != nil {
return nil, fmt.Errorf(
"unmarshaling ICANN gTLD JSON data: %v", err)
// We expect there to always be GTLD data. If there was none after unmarshaling
// then its likely the data format has changed or something else has gone wrong.
if len(results.GTLDs) == 0 {
return nil, errors.New("found no gTLD information after unmarshaling")
// Normalize each tldEntry. This will remove leading/trailing whitespace and
// populate the ULabel with the ALabel if the entry has no ULabel.
for _, tldEntry := range results.GTLDs {
filtered := filterGTLDs(results.GTLDs)
if len(filtered) == 0 {
return nil, errors.New(
"found no gTLD information after removing legacy and contract terminated gTLDs")
return filtered, nil
// renderTemplate renders the given template to the provided writer, using the
// templateData, or returns an error. The template is executed into an
// in-memory buffer first, so nothing is written to the writer when execution
// fails part-way through.
func renderTemplate(writer io.Writer, tmpl *template.Template, templateData interface{}) error {
	var buf bytes.Buffer
	if err := tmpl.Execute(&buf, templateData); err != nil {
		return err
	}
	_, err := writer.Write(buf.Bytes())
	return err
}
// clock is a small interface that lets us mock time in unit tests.
type clock interface {
	// Now returns the time to treat as the current time.
	Now() time.Time
}

// realClock is an implementation of clock that uses time.Now() natively.
type realClock struct{}

// Now returns the current time.Time using the system clock.
func (c realClock) Now() time.Time {
	return time.Now()
}
// renderHeader renders the pslHeaderTemplate to the writer or returns an error. The
// provided clock instance is used for the header last update timestamp. If no
// clk instance is provided realClock is used.
func renderHeader(writer io.Writer, clk clock) error {
if clk == nil {
clk = &realClock{}
templateData := struct {
URL string
Date time.Time
DateFormat string
Date: clk.Now().UTC(),
DateFormat: time.RFC3339,
return renderTemplate(writer, pslHeaderTemplate, templateData)
// renderData renders the given list of pslEntry objects using the pslTemplate.
// The rendered template data is written to the provided writer or an error is
// returned.
func renderData(writer io.Writer, entries []*pslEntry) error {
templateData := struct {
Entries []*pslEntry
Entries: entries,
return renderTemplate(writer, pslTemplate, templateData)
// Process handles updating a datFile with new gTLD content. If there are no
// gTLD updates the existing dat file's contents will be returned. If there are
// updates, the new updates will be spliced into place and the updated file contents
// returned.
func process(datFile *datFile, dataURL string, clk clock) (string, error) {
// Get the lines for the gTLD data span - this includes both the header with the
// date and the actual gTLD entries.
spanLines, err := datFile.getGTLDLines()
if err != nil {
return "", err
// Render a new header for the gTLD data.
var newHeaderBuf strings.Builder
if err := renderHeader(&newHeaderBuf, clk); err != nil {
return "", err
// Figure out how many lines the header with the dynamic date is.
newHeaderLines := strings.Split(newHeaderBuf.String(), "\n")
headerLen := len(newHeaderLines)
// We should have at least that many lines in the existing span data.
if len(spanLines) <= headerLen {
return "", errors.New("gtld span data was too small, missing header?")
// The gTLD data can be found by skipping the header lines
existingData := strings.Join(spanLines[headerLen:], "\n")
// Fetch new PSL entries.
entries, err := getPSLEntries(dataURL)
if err != nil {
return "", err
// Render the new gTLD PSL section with the new entries.
var newDataBuf strings.Builder
if err := renderData(&newDataBuf, entries); err != nil {
return "", err
// If the newly rendered data doesn't match the existing data then we want to
// update the dat file content by replacing the old span with the new content.
if newDataBuf.String() != existingData {
newContent := newHeaderBuf.String() + "\n" + newDataBuf.String()
if err := datFile.ReplaceGTLDContent(newContent); err != nil {
return "", err
return datFile.String(), nil
// main is the command entry point. It reads the dat file named by the
// pslDatFile flag, runs process() on it against ICANN_GTLD_JSON_URL with a nil
// clock, and either prints the result or writes it back over the same path,
// depending on the overwrite flag.
//
// NOTE(review): several short lines of this function are missing from this
// copy of the file (the flag names and default values, the flag parsing call,
// the calls that report errors, the statement that prints the content, and
// the closing braces). Restore them from the upstream source before building.
func main() {
// ifErrQuit reports a non-nil error on stderr with an "error updating gTLD
// data" prefix.
// NOTE(review): presumably it also terminates the program afterwards, as the
// name suggests - the remainder of its body is missing here; confirm.
ifErrQuit := func(err error) {
if err != nil {
fmt.Fprintf(os.Stderr, "error updating gTLD data: %v\n", err)
// pslDatFile is the path of the dat file to read and, when overwriting, to
// write. The second flag's help text refers to it as -psl-dat-file.
pslDatFile := flag.String(
"file path to the public_suffix_list.dat data file to be updated with new gTLDs")
// overwrite selects writing the result back to the dat file instead of
// printing it to stdout.
overwrite := flag.Bool(
"overwrite -psl-dat-file with the new data instead of printing to stdout")
// Parse CLI flags.
// Read the existing file content and find the span that contains the gTLD data.
datFile, err := readDatFile(*pslDatFile)
// Process the dat file. Passing a nil clock lets the header rendering fall
// back to the real system clock.
content, err := process(datFile, ICANN_GTLD_JSON_URL, nil)
// If we're not overwriting the file, print the content to stdout.
if !*overwrite {
// Otherwise print nothing to stdout and write the content over the existing
// pslDatFile path we read earlier, using file mode 0644.
err = os.WriteFile(*pslDatFile, []byte(content), 0644)