blob: 32c85d55ec0445aa5ed3c02c809df335231db838 [file] [log] [blame]
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package transport
import (
	"errors"
	"fmt"
	"io"
	"log"
	"net"
	"net/url"
	"regexp"
	"strings"
	"time"

	"google.golang.org/api/googleapi"
	"google.golang.org/genproto/googleapis/rpc/errdetails"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)
// RetryErrorPredicateFunc classifies an error. It returns whether the error
// matched the predicate together with a short human-readable reason; every
// predicate in this file returns (false, "") when there is no match.
//
// Most predicates in this file use a match to mean "retry this error".
// NOTE(review): Is429QuotaError shares this signature but its reason says the
// error is not retryable - confirm how callers distinguish the two uses.
type RetryErrorPredicateFunc func(error) (bool, string)
/** ADD GLOBAL ERROR RETRY PREDICATES HERE **/
// defaultErrorRetryPredicates lists the retry predicates that should apply to
// all requests. Add new global predicates here.
var defaultErrorRetryPredicates = []RetryErrorPredicateFunc{
// Common network errors (usually wrapped by URL error)
isNetworkTemporaryError,
isNetworkTimeoutError,
isIoEOFError,
isConnectionResetNetworkError,
// Common GCP error codes
isCommonRetryableErrorCode,
// While this might apply only to Cloud SQL, historically,
// we had this in our global default error retries.
// Keeping it as a default for now.
is409OperationInProgressError,
// GCE Error codes - we don't have a way to add these to all GCE resources
// easily, so add them globally.
// GCE Subnetworks are considered unready for a brief period when certain
// operations are performed on them, and the scope is likely too broad to
// apply a mutex. If we attempt an operation w/ an unready subnetwork, retry
// it.
isSubnetworkUnreadyError,
// As of February 2022 GCE seems to have added extra quota enforcement on
// reads, causing significant failure for our CI and for large customers.
// GCE returns the wrong error code, as this should be a 429, which we retry
// already.
is403QuotaExceededPerMinuteError,
}
/** END GLOBAL ERROR RETRY PREDICATES HERE **/
// isNetworkTemporaryError reports whether err is a *net.OpError or a
// *url.Error whose Temporary method returns true.
//
// It returns (true, "marked as temporary") on a match and (false, "")
// otherwise. The type check is applied to err itself; err is not unwrapped
// here.
//
// NOTE(review): the net.Error interface documents Temporary as deprecated;
// consider whether this predicate is still wanted.
func isNetworkTemporaryError(err error) (bool, string) {
	// The reason previously read "marked as timeout", copied from
	// isNetworkTimeoutError, which made retry logs misleading.
	const reason = "marked as temporary"

	switch e := err.(type) {
	case *net.OpError:
		if e.Temporary() {
			return true, reason
		}
	case *url.Error:
		if e.Temporary() {
			return true, reason
		}
	}
	return false, ""
}
// isNetworkTimeoutError reports whether err is a *net.OpError or a *url.Error
// whose Timeout method returns true. It returns (true, "marked as timeout")
// on a match and (false, "") otherwise.
func isNetworkTimeoutError(err error) (bool, string) {
	timedOut := false
	switch e := err.(type) {
	case *net.OpError:
		timedOut = e.Timeout()
	case *url.Error:
		timedOut = e.Timeout()
	}
	if !timedOut {
		return false, ""
	}
	return true, "marked as timeout"
}
// isIoEOFError reports whether err is, or wraps, io.ErrUnexpectedEOF.
//
// errors.Is walks the whole Unwrap chain, so the sentinel is found whether it
// is returned directly, wrapped in a *url.Error, or wrapped with
// fmt.Errorf("%w"). Plain io.EOF is deliberately not matched.
//
// It returns (true, "Got unexpected EOF") on a match and (false, "")
// otherwise.
func isIoEOFError(err error) (bool, string) {
	if errors.Is(err, io.ErrUnexpectedEOF) {
		return true, "Got unexpected EOF"
	}
	return false, ""
}
// connectionResetByPeerErr is the message suffix that identifies a
// "connection reset by peer" failure.
const connectionResetByPeerErr = ": connection reset by peer"

// isConnectionResetNetworkError reports whether err's message ends with
// connectionResetByPeerErr. On a match the reason embeds the error text.
// err must be non-nil, since its Error method is called unconditionally.
func isConnectionResetNetworkError(err error) (bool, string) {
	if !strings.HasSuffix(err.Error(), connectionResetByPeerErr) {
		return false, ""
	}
	return true, fmt.Sprintf("reset connection error: %v", err)
}
// is409OperationInProgressError matches a *googleapi.Error with HTTP status
// 409 whose body contains "operationInProgress". Some APIs, such as Cloud SQL,
// throw that 409 when concurrent calls are being made.
//
// Looking at the contents of the error body is currently the only way to tell
// that a 409 is retryable because of concurrent calls.
// See https://github.com/hashicorp/terraform-provider-google/issues/3279
func is409OperationInProgressError(err error) (bool, string) {
	if apiErr, isAPIErr := err.(*googleapi.Error); isAPIErr {
		if apiErr.Code == 409 && strings.Contains(apiErr.Body, "operationInProgress") {
			log.Printf("[DEBUG] Dismissed an error as retryable based on error code 409 and error reason 'operationInProgress': %s", err)
			return true, "Operation still in progress"
		}
	}
	return false, ""
}
// isSubnetworkUnreadyError matches a *googleapi.Error with HTTP status 400
// whose body contains both "resourceNotReady" and "subnetworks". A match is
// logged at debug level and reported with the reason "Subnetwork not ready".
func isSubnetworkUnreadyError(err error) (bool, string) {
	if apiErr, isAPIErr := err.(*googleapi.Error); isAPIErr && apiErr.Code == 400 {
		body := apiErr.Body
		if strings.Contains(body, "resourceNotReady") && strings.Contains(body, "subnetworks") {
			log.Printf("[DEBUG] Dismissed an error as retryable based on error code 400 and error reason 'resourceNotReady' w/ `subnetwork`: %s", err)
			return true, "Subnetwork not ready"
		}
	}
	return false, ""
}
// quotaExceededPerMinuteRegex matches the per-minute quota message and
// captures the quota metric ("Metric") and the limit ("Limit"). It is compiled
// once at package scope instead of on every predicate call.
var quotaExceededPerMinuteRegex = regexp.MustCompile(`Quota exceeded for quota metric '(?P<Metric>.*)' and limit '(?P<Limit>.* per minute)' of service`)

// is403QuotaExceededPerMinuteError matches a *googleapi.Error with HTTP status
// 403 whose body carries a per-minute "Quota exceeded" message. GCE (and
// possibly other APIs) incorrectly return a 403 rather than a 429 on rate
// limits.
//
// On a match it logs the quota metric at debug level and returns true with a
// reason naming the limit; otherwise it returns (false, "").
func is403QuotaExceededPerMinuteError(err error) (bool, string) {
	gerr, ok := err.(*googleapi.Error)
	if !ok || gerr.Code != 403 {
		return false, ""
	}

	// One FindStringSubmatch both tests for a match and yields the captures.
	matches := quotaExceededPerMinuteRegex.FindStringSubmatch(gerr.Body)
	if matches == nil {
		return false, ""
	}

	metric := matches[quotaExceededPerMinuteRegex.SubexpIndex("Metric")]
	limit := matches[quotaExceededPerMinuteRegex.SubexpIndex("Limit")]
	log.Printf("[DEBUG] Dismissed an error as retryable based on error code 403 and error message 'Quota exceeded for quota metric `%s`: %s", metric, err)
	return true, fmt.Sprintf("Waiting for quota limit %s to refresh", limit)
}
// FINGERPRINT_FAIL_ERRORS holds the fingerprint-related message fragments we
// have encountered. If an error message contains one of them, we are confident
// the error is due to a fingerprint mismatch. IsFingerprintError searches the
// error text for each entry.
var FINGERPRINT_FAIL_ERRORS = []string{"Invalid fingerprint.", "Supplied fingerprint does not match current metadata fingerprint."}
// IsFingerprintError matches a *googleapi.Error with HTTP status 412 whose
// error text contains one of the FINGERPRINT_FAIL_ERRORS fragments, i.e. an
// error that looks like a fingerprint mismatch and is worth retrying.
func IsFingerprintError(err error) (bool, string) {
	if apiErr, isAPIErr := err.(*googleapi.Error); !isAPIErr || apiErr.Code != 412 {
		return false, ""
	}

	text := err.Error()
	for i := range FINGERPRINT_FAIL_ERRORS {
		if strings.Contains(text, FINGERPRINT_FAIL_ERRORS[i]) {
			return true, "fingerprint mismatch"
		}
	}
	return false, ""
}
// METADATA_FINGERPRINT_RETRIES is the maximum number of times
// MetadataRetryWrapper invokes the update function.
const METADATA_FINGERPRINT_RETRIES = 10

// MetadataRetryWrapper calls update until it succeeds, fails with something
// other than a fingerprint error, or METADATA_FINGERPRINT_RETRIES attempts
// have been made.
//
// Since the Google Compute API uses optimistic locking, there is a chance the
// updated metadata has to be resubmitted; update is the function that attempts
// to submit it. The wrapper returns nil on success, the first non-fingerprint
// error unchanged, or a generic error once every attempt has been used.
func MetadataRetryWrapper(update func() error) error {
	for attempt := 0; attempt < METADATA_FINGERPRINT_RETRIES; attempt++ {
		err := update()
		if err == nil {
			return nil
		}

		retryable, _ := IsFingerprintError(err)
		if !retryable {
			// Something else went wrong, don't retry
			return err
		}
		log.Printf("[DEBUG] Dismissed an error as retryable as a fingerprint mismatch: %s", err)
	}
	// The loop only falls through after exactly METADATA_FINGERPRINT_RETRIES attempts.
	return fmt.Errorf("Failed to update metadata after %d retries", METADATA_FINGERPRINT_RETRIES)
}
// IamMemberMissing matches a *googleapi.Error with HTTP status 400 whose body
// contains "permission".
//
// If a permission necessary to provision a resource is created in the same
// config as the resource itself, the permission may not have propagated by the
// time terraform attempts to create the resource. This predicate allows those
// errors to be retried until the timeout expires.
func IamMemberMissing(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "permission") {
		return false, ""
	}
	return true, "Waiting for IAM member permissions to propagate."
}
// IsSqlOperationInProgressError matches every *googleapi.Error with HTTP
// status 409, except those whose body contains "instanceAlreadyExists". The
// remaining 409s are treated as concurrent Cloud SQL operations that should be
// retried.
func IsSqlOperationInProgressError(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 409 {
		return false, ""
	}
	// An already-existing instance is a genuine conflict, not contention.
	if strings.Contains(apiErr.Body, "instanceAlreadyExists") {
		return false, ""
	}
	return true, "Waiting for other concurrent Cloud SQL operations to finish"
}
// ServiceUsageServiceBeingActivated matches a *googleapi.Error with HTTP
// status 400 whose body contains "Another activation or deactivation is in
// progress".
//
// Service usage reports this when it decides the same service is being
// activated multiple times. That can happen if a service and a dependent
// service aren't batched together - eg container.googleapis.com in one request
// followed by compute.g.c in the next (container relies on compute and
// implicitly activates it).
func ServiceUsageServiceBeingActivated(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "Another activation or deactivation is in progress") {
		return false, ""
	}
	return true, "Waiting for same service activation/deactivation to finish"
}
// ServiceUsageInternalError160009 matches any error whose message contains all
// of "encountered internal error", "160009" and "with failed services".
// See https://github.com/hashicorp/terraform-provider-google/issues/14691 for
// details on the error message this handles.
//
// This is a post-operation error, so it arrives as tpgresource.CommonOpError
// rather than googleapi.Error. A cyclical dependency between transport and
// tpgresource blocks using that type here, so the check works off the error
// string instead. err must be non-nil.
func ServiceUsageInternalError160009(err error) (bool, string) {
	msg := err.Error()
	for _, marker := range []string{"encountered internal error", "160009", "with failed services"} {
		if !strings.Contains(msg, marker) {
			return false, ""
		}
	}
	return true, "retrying internal error 160009."
}
// IsBigqueryIAMQuotaError matches a *googleapi.Error with HTTP status 403
// whose body contains "exceeded rate limits" (compared case-insensitively).
// Bigquery returns this for concurrent operations, which are implemented in
// terms of 'edit quota'.
func IsBigqueryIAMQuotaError(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 403 {
		return false, ""
	}
	if body := strings.ToLower(apiErr.Body); !strings.Contains(body, "exceeded rate limits") {
		return false, ""
	}
	return true, "Waiting for Bigquery edit quota to refresh"
}
// IsMonitoringConcurrentEditError matches a *googleapi.Error with HTTP status
// 409 whose body contains "too many concurrent edits" or "could not fulfill
// the request" (compared case-insensitively), which Monitoring returns for
// concurrent operations.
func IsMonitoringConcurrentEditError(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 409 {
		return false, ""
	}

	body := strings.ToLower(apiErr.Body)
	if strings.Contains(body, "too many concurrent edits") || strings.Contains(body, "could not fulfill the request") {
		return true, "Waiting for other Monitoring changes to finish"
	}
	return false, ""
}
// IsMonitoringPermissionError matches any *googleapi.Error with HTTP status
// 403; the body is not inspected. The reason indicates the project may not be
// ready for the metrics scope yet.
func IsMonitoringPermissionError(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 403 {
		return false, ""
	}
	return true, "Waiting for project to be ready for metrics scope"
}
// IsCryptoKeyVersionsPendingGeneration matches a *googleapi.Error with HTTP
// status 400 whose body contains "PENDING_GENERATION", which KMS
// CryptoKeyVersions returns while a key is still being generated.
func IsCryptoKeyVersionsPendingGeneration(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "PENDING_GENERATION") {
		return false, ""
	}
	return true, "Waiting for pending key generation"
}
// IsNotFoundRetryableError returns a predicate that matches any
// *googleapi.Error with HTTP status 404. It is intended for specific
// operations where a 404 on get can be retried.
//
// opType should describe the operation for which a 404 is retryable; it is
// only used to build the reason string.
func IsNotFoundRetryableError(opType string) RetryErrorPredicateFunc {
	return func(err error) (bool, string) {
		apiErr, isAPIErr := err.(*googleapi.Error)
		if !isAPIErr || apiErr.Code != 404 {
			return false, ""
		}
		return true, fmt.Sprintf("Retry 404s for %s", opType)
	}
}
// IsPeeringOperationInProgress matches a *googleapi.Error with HTTP status 400
// whose body contains "There is a peering operation in progress".
func IsPeeringOperationInProgress(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "There is a peering operation in progress") {
		return false, ""
	}
	return true, "Waiting peering operation to complete"
}
// DatastoreIndex409Contention matches a *googleapi.Error with HTTP status 409
// whose body contains "too much contention".
func DatastoreIndex409Contention(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 409 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "too much contention") {
		return false, ""
	}
	return true, "too much contention - waiting for less activity"
}
// FirestoreField409RetryUnderlyingDataChanged matches a *googleapi.Error with
// HTTP status 409 whose body contains "Please retry, underlying data changed".
// Relevant for firestore in datastore mode.
func FirestoreField409RetryUnderlyingDataChanged(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 409 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "Please retry, underlying data changed") {
		return false, ""
	}
	return true, "underlying data changed - retrying"
}
// FirestoreIndex409Retry matches a *googleapi.Error with HTTP status 409 whose
// body contains "Aborted due to cross-transaction contention" or "Please
// retry, underlying data changed"; the reason reflects whichever message was
// found first, in that order. Relevant for firestore in datastore mode.
func FirestoreIndex409Retry(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 409 {
		return false, ""
	}

	switch {
	case strings.Contains(apiErr.Body, "Aborted due to cross-transaction contention"):
		return true, "aborted due to cross-transaction contention - retrying"
	case strings.Contains(apiErr.Body, "Please retry, underlying data changed"):
		return true, "underlying data changed - retrying"
	}
	return false, ""
}
// IapClient409Operation matches a *googleapi.Error with HTTP status 409 whose
// body contains "operation was aborted" (compared case-insensitively).
func IapClient409Operation(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 409 {
		return false, ""
	}
	if body := strings.ToLower(apiErr.Body); !strings.Contains(body, "operation was aborted") {
		return false, ""
	}
	return true, "operation was aborted possibly due to concurrency issue - retrying"
}
// HealthcareDatasetNotInitialized matches a *googleapi.Error with HTTP status
// 404 whose body contains "dataset not initialized" (compared
// case-insensitively).
func HealthcareDatasetNotInitialized(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 404 {
		return false, ""
	}
	if body := strings.ToLower(apiErr.Body); !strings.Contains(body, "dataset not initialized") {
		return false, ""
	}
	return true, "dataset not initialized - retrying"
}
// IsCloudRunCreationConflict matches any *googleapi.Error with HTTP status
// 409; the body is not inspected.
//
// Cloud Run APIs may return a 409 on create to indicate that a resource has
// been deleted in the foreground (eg GET and LIST) but not in the backing
// apiserver, so the 409 can be retried. Due to limitations in MMv1's
// error_retry_predicates this is currently applied to all requests, although
// it is only expected on create.
func IsCloudRunCreationConflict(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 409 {
		return false, ""
	}
	return true, "saw a 409 - waiting until background deletion completes"
}
// IamServiceAccountNotFound matches a *googleapi.Error with HTTP status 400
// whose body contains both "Service account" and "does not exist".
//
// If a service account is deleted in the middle of updating an IAM policy it
// can cause the API to return an error. Fine-grained IAM resources read the
// policy, modify it, then send it back to the API, so retrying is useful
// particularly in high-traffic projects.
//
// We don't want to retry _every_ time this error is seen, because a
// user-provided SA could trigger it too. The callsite should check whether the
// current etag matches the old etag and short-circuit if it does, as that
// indicates the new config is the likely problem.
func IamServiceAccountNotFound(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}

	body := apiErr.Body
	if strings.Contains(body, "Service account") && strings.Contains(body, "does not exist") {
		return true, "service account not found in IAM"
	}
	return false, ""
}
// IsApigeeRetryableError matches a *googleapi.Error with HTTP status 400 whose
// body contains "the resource is locked by another operation" (compared
// case-insensitively). Concurrent Apigee operations can fail with this error.
func IsApigeeRetryableError(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}
	if body := strings.ToLower(apiErr.Body); !strings.Contains(body, "the resource is locked by another operation") {
		return false, ""
	}
	return true, "Waiting for other concurrent operations to finish"
}
// IsDataflowJobUpdateRetryableError matches a *googleapi.Error with HTTP
// status 404 whose body contains "in RUNNING OR DRAINING state".
func IsDataflowJobUpdateRetryableError(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 404 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "in RUNNING OR DRAINING state") {
		return false, ""
	}
	return true, "Waiting for job to be in a valid state"
}
// PubsubTopicProjectNotReady matches a *googleapi.Error with HTTP status 400
// whose body contains "retry this operation", logging the match at debug
// level.
//
// Cloud PubSub returns this 400 if a topic's parent project was recently
// created and an organization policy has not propagated.
// See https://github.com/hashicorp/terraform-provider-google/issues/4349
func PubsubTopicProjectNotReady(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "retry this operation") {
		return false, ""
	}
	log.Printf("[DEBUG] Dismissed error as a retryable operation: %s", err)
	return true, "Waiting for Pubsub topic's project to properly initialize with organiation policy"
}
// isCommonRetryableErrorCode matches a *googleapi.Error whose HTTP status is
// one of the common retryable codes: 429, 500, 502 or 503. A match is logged
// at debug level and the reason names the code.
// TODO(#5609): This may not need to be applied globally - figure out
// what retryable error codes apply to which API.
func isCommonRetryableErrorCode(err error) (bool, string) {
	if apiErr, isAPIErr := err.(*googleapi.Error); isAPIErr {
		switch apiErr.Code {
		case 429, 500, 502, 503:
			log.Printf("[DEBUG] Dismissed an error as retryable based on error code: %s", err)
			return true, fmt.Sprintf("Retryable error code %d", apiErr.Code)
		}
	}
	return false, ""
}
// Is429QuotaError matches any *googleapi.Error with HTTP status 429; the body
// is not inspected. The reason it returns states that 429s are not retryable
// for the resource.
// NOTE(review): presumably callers use a match here to stop retrying rather
// than to retry - confirm at the call sites.
func Is429QuotaError(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 429 {
		return false, ""
	}
	return true, "429s are not retryable for this resource"
}
// IsAppEngineRetryableError matches two kinds of *googleapi.Error, comparing
// the body case-insensitively:
//   - HTTP status 409 with "operation is already in progress", returned for
//     concurrent operations;
//   - HTTP status 404 with "unable to retrieve p4sa", indicating the p4sa has
//     not yet propagated.
func IsAppEngineRetryableError(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr {
		return false, ""
	}

	switch apiErr.Code {
	case 409:
		if strings.Contains(strings.ToLower(apiErr.Body), "operation is already in progress") {
			return true, "Waiting for other concurrent App Engine changes to finish"
		}
	case 404:
		if strings.Contains(strings.ToLower(apiErr.Body), "unable to retrieve p4sa") {
			return true, "Waiting for P4SA propagation to GAIA"
		}
	}
	return false, ""
}
// IsBigTableRetryableError reports whether err is a gRPC status error that
// carries a non-zero retry delay.
//
// Bigtable uses gRPC and thus does not return errors of type *googleapi.Error.
// Instead the errors returned are *status.Error. See the types of codes
// returned here (https://pkg.go.dev/google.golang.org/grpc/codes#Code).
//
// The error is retryable when its status code is not OK and the first
// *errdetails.RetryInfo detail holds a non-zero retry delay. The delay itself
// is currently not used. In every other case it returns (false, "").
func IsBigTableRetryableError(err error) (bool, string) {
	errorStatus, ok := status.FromError(err)
	if !ok || errorStatus.Code() == codes.OK {
		return false, ""
	}

	var retryDelayDuration time.Duration
	for _, detail := range errorStatus.Details() {
		retryInfo, ok := detail.(*errdetails.RetryInfo)
		if !ok {
			continue
		}
		// GetRetryDelay returns nil when the field is unset. The generated
		// getters are nil-safe, whereas reading retryDelay.Seconds directly
		// would panic on a RetryInfo that carries no delay.
		retryDelay := retryInfo.GetRetryDelay()
		retryDelayDuration = time.Duration(retryDelay.GetSeconds())*time.Second + time.Duration(retryDelay.GetNanos())*time.Nanosecond
		// Only the first RetryInfo detail is considered.
		break
	}

	if retryDelayDuration != 0 {
		// TODO: Consider sleep for `retryDelayDuration` before retrying.
		return true, "Bigtable operation failed with a retryable error, will retry"
	}
	return false, ""
}
// IsSwgAutogenRouterRetryable matches a *googleapi.Error with HTTP status 400
// whose body contains "not ready" (compared case-insensitively).
//
// A gateway of type 'SECURE_WEB_GATEWAY' automatically creates a router but
// does not delete it, and that router might be re-used by other gateways in
// the same network. When multiple gateways are deleted at the same time,
// multiple attempts to delete the same router are triggered and the API throws
// an error saying "The resource <router> is not ready".
func IsSwgAutogenRouterRetryable(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}
	if body := strings.ToLower(apiErr.Body); !strings.Contains(body, "not ready") {
		return false, ""
	}
	return true, "Waiting swg autogen router to be ready"
}
// IsForbiddenIamServiceAccountRetryableError returns a predicate that matches
// a *googleapi.Error with HTTP status 403 whose body contains "Permission
// 'iam.serviceAccounts.get' denied on resource (or it may not exist)".
//
// The IAM API is eventually consistent and returns 403 Forbidden (instead of
// 404 Not found) for some operations when a newly created resource is read
// right after creation and not found.
//
// opType should describe the operation for which the 403 is retryable; it is
// only used to build the reason string.
func IsForbiddenIamServiceAccountRetryableError(opType string) RetryErrorPredicateFunc {
	return func(err error) (bool, string) {
		apiErr, isAPIErr := err.(*googleapi.Error)
		if !isAPIErr || apiErr.Code != 403 {
			return false, ""
		}
		if !strings.Contains(apiErr.Body, "Permission 'iam.serviceAccounts.get' denied on resource (or it may not exist)") {
			return false, ""
		}
		return true, fmt.Sprintf("Retry 403s for %s", opType)
	}
}
// ExternalIpServiceNotActive matches a *googleapi.Error with HTTP status 400
// whose body contains "External IP address network service is not active in
// the provided network policy". It lets the creation of a
// `google_vmwareengine_external_address` resource be retried while the network
// policy's External IP field is not active yet.
func ExternalIpServiceNotActive(err error) (bool, string) {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr || apiErr.Code != 400 {
		return false, ""
	}
	if !strings.Contains(apiErr.Body, "External IP address network service is not active in the provided network policy") {
		return false, ""
	}
	return true, "Waiting for external ip service to be enabled"
}