checker/bin/query-github.sh - typetools/checker-framework - Git at Google

 #!/bin/sh

 # This script collects a list of projects that match a query from GitHub.
 #
 # usage: query-github.sh QUERY_FILE [PAGE_COUNT]

 # inputs:
 #
 # The file git-personal-access-token must exist in the directory from which
 # this script is run, and must be a valid github OAuth token.  The token only
 # needs the "Access public repositories" permission.
 #
 # $1 is the query file, which should contain the literal string to use
 # as the github search. REQUIRED, no default
 #
 # $2 is the number of pages to search. default 1

 # outputs:
 #
 # stdout gets one line per matching repository: the repository URL, a tab,
 # and the hash of the first commit returned by the repository's commit
 # listing (empty if that lookup produced no hash).
 # Diagnostics and debug output go to stderr.
 #
 # exit status: 0 on success, 1 if a search request kept failing or a
 # temporary file could not be created, 2 if a required input is missing.

 # Set to 1 to enable debug output from this script.
 DEBUG=0

 query_file=$1
 # Number of times to try a GitHub search query before giving up.
 query_tries=5
 # File holding the GitHub token, relative to the current directory.
 token_file=git-personal-access-token

 if [ -z "${query_file}" ]; then
     echo "you must provide a query file as the first argument" >&2
     exit 2
 fi

 if [ ! -r "${query_file}" ]; then
     echo "cannot read query file: ${query_file}" >&2
     exit 2
 fi

 if [ ! -r "${token_file}" ]; then
     echo "cannot read ${token_file} in the current directory" >&2
     exit 2
 fi

 page_count=${2:-1}

 # The search string is placed in a URL, so spaces become '+'.
 query=$(tr ' ' '+' < "${query_file}")
 # Read the token once instead of once per request.
 token=$(cat "${token_file}")

 # USER is not guaranteed to be set; fall back to asking id.
 tmp_dir="/tmp/${USER:-$(id -un)}"
 mkdir -p "${tmp_dir}"

 # Remove leftovers from earlier runs.  The glob must stay outside the
 # quotes, otherwise it is passed to rm literally and matches nothing.

 ## for storing the results before sorting and uniqing them
 rm -f "${tmp_dir}"/github-query-results-*.txt
 tempfile=$(mktemp "${tmp_dir}/github-query-results-$(date +%Y%m%d-%H%M%S)-XXX.txt") || exit 1
 #trap "rm -f ${tempfile}" 0 2 3 15

 ## for storing one commit hash per line of ${tempfile}
 rm -f "${tmp_dir}"/github-hash-results-*.txt
 hashfile=$(mktemp "${tmp_dir}/github-hash-results-$(date +%Y%m%d-%H%M%S)-XXX.txt") || exit 1
 #trap "rm -f ${hashfile}" 0 2 3 15

 ## for storing the body of the most recent search response
 rm -f "${tmp_dir}"/curl-output-*.txt
 curl_output_file=$(mktemp "${tmp_dir}/curl-output-$(date +%Y%m%d-%H%M%S)-XXX.txt") || exit 1

 # find the repos
 for i in $(seq "${page_count}"); do
     # GitHub only allows 30 searches per minute, so add a delay to each request.
     if [ "${i}" -gt 1 ]; then
         sleep 5
     fi

     full_query="https://api.github.com/search/code?q=${query}&page=${i}"
     if [ "${DEBUG}" -ne 0 ]; then
         echo "full_query=$full_query" >&2
     fi

     status_code=
     for tries in $(seq "${query_tries}"); do
         # The response body goes to ${curl_output_file}; only the HTTP
         # status code is captured on stdout.
         status_code=$(curl -s \
             -H "Authorization: token ${token}" \
             -H "Accept: application/vnd.github.v3+json" \
             -w "%{http_code}" \
             -o "${curl_output_file}" \
             "${full_query}")

         # String comparison: the status may be empty or "000" when curl
         # itself fails, which must count as a failure, not a test error.
         if [ "${status_code}" = 200 ] || [ "${status_code}" = 422 ]; then
             # Don't retry.
             # 200 is success.  422 is what GitHub returns for requests past
             # its result limit (see below), so retrying cannot help.
             break
         elif [ "${tries}" -lt "${query_tries}" ]; then
             # Retry.
             # Other status codes are failures. Failures are usually due to
             # triggering the abuse detection mechanism for sending too many
             # requests, so we add a delay before every further attempt.
             sleep 20
         fi
     done

     # GitHub only returns the first 1000 results. Requests past this limit
     # return 422, so stop making requests.
     if [ "${status_code}" = 422 ]; then
         break
     elif [ "${status_code}" != 200 ]; then
         echo "GitHub query failed, last response:" >&2
         cat "${curl_output_file}" >&2
         rm -f "${curl_output_file}"
         exit 1
     fi

     # Keep the html_url lines indented by at least 8 but fewer than 10
     # spaces, and extract the URL (the 4th double-quote-delimited field).
     grep "        \"html_url" < "${curl_output_file}" \
         | grep -v "          " \
         | sort -u \
         | cut -d \" -f 4 >> "${tempfile}"
 done

 rm -f "${curl_output_file}"

 # Each loop iteration was sorted and unique; this does it for the full result.
 sort -u -o "${tempfile}" "${tempfile}"

 # Look up one commit hash per repository.
 while IFS= read -r line; do
     # A repository URL looks like https://github.com/OWNER/REPO, so the
     # owner and repo are the 4th and 5th '/'-separated fields.
     owner=$(printf '%s\n' "${line}" | cut -d / -f 4)
     repo=$(printf '%s\n' "${line}" | cut -d / -f 5)
     hash_query="https://api.github.com/repos/${owner}/${repo}/commits?per_page=1"
     # Each header needs its own -H; without it curl treats the header
     # text as another URL to fetch.
     hash=$(curl -s \
         -H "Authorization: token ${token}" \
         -H "Accept: application/vnd.github.v3+json" \
         "${hash_query}" \
         | grep '^    "sha":' \
         | head -n 1 \
         | cut -d \" -f 4)
     # Always write exactly one line, even when no hash was found, so that
     # ${hashfile} stays line-aligned with ${tempfile} for paste.
     printf '%s\n' "${hash}" >> "${hashfile}"
 done < "${tempfile}"

 paste "${tempfile}" "${hashfile}"
	#!/bin/sh

	# This script collects a list of projects that match a query from GitHub.
	#
	# usage: query-github.sh QUERY_FILE [PAGE_COUNT]

	# inputs:
	#
	# The file git-personal-access-token must exist in the directory from which
	# this script is run, and must be a valid github OAuth token. The token only
	# needs the "Access public repositories" permission.
	#
	# $1 is the query file, which should contain the literal string to use
	# as the github search. REQUIRED, no default
	#
	# $2 is the number of pages to search. default 1

	# outputs:
	#
	# stdout gets one line per matching repository: the repository URL, a tab,
	# and the hash of the first commit returned by the repository's commit
	# listing (empty if that lookup produced no hash).
	# Diagnostics and debug output go to stderr.
	#
	# exit status: 0 on success, 1 if a search request kept failing or a
	# temporary file could not be created, 2 if a required input is missing.

	# Set to 1 to enable debug output from this script.
	DEBUG=0

	query_file=$1
	# Number of times to try a GitHub search query before giving up.
	query_tries=5
	# File holding the GitHub token, relative to the current directory.
	token_file=git-personal-access-token

	if [ -z "${query_file}" ]; then
	    echo "you must provide a query file as the first argument" >&2
	    exit 2
	fi

	if [ ! -r "${query_file}" ]; then
	    echo "cannot read query file: ${query_file}" >&2
	    exit 2
	fi

	if [ ! -r "${token_file}" ]; then
	    echo "cannot read ${token_file} in the current directory" >&2
	    exit 2
	fi

	page_count=${2:-1}

	# The search string is placed in a URL, so spaces become '+'.
	query=$(tr ' ' '+' < "${query_file}")
	# Read the token once instead of once per request.
	token=$(cat "${token_file}")

	# USER is not guaranteed to be set; fall back to asking id.
	tmp_dir="/tmp/${USER:-$(id -un)}"
	mkdir -p "${tmp_dir}"

	# Remove leftovers from earlier runs. The glob must stay outside the
	# quotes, otherwise it is passed to rm literally and matches nothing.

	## for storing the results before sorting and uniqing them
	rm -f "${tmp_dir}"/github-query-results-*.txt
	tempfile=$(mktemp "${tmp_dir}/github-query-results-$(date +%Y%m%d-%H%M%S)-XXX.txt") || exit 1
	#trap "rm -f ${tempfile}" 0 2 3 15

	## for storing one commit hash per line of ${tempfile}
	rm -f "${tmp_dir}"/github-hash-results-*.txt
	hashfile=$(mktemp "${tmp_dir}/github-hash-results-$(date +%Y%m%d-%H%M%S)-XXX.txt") || exit 1
	#trap "rm -f ${hashfile}" 0 2 3 15

	## for storing the body of the most recent search response
	rm -f "${tmp_dir}"/curl-output-*.txt
	curl_output_file=$(mktemp "${tmp_dir}/curl-output-$(date +%Y%m%d-%H%M%S)-XXX.txt") || exit 1

	# find the repos
	for i in $(seq "${page_count}"); do
	    # GitHub only allows 30 searches per minute, so add a delay to each request.
	    if [ "${i}" -gt 1 ]; then
	        sleep 5
	    fi

	    full_query="https://api.github.com/search/code?q=${query}&page=${i}"
	    if [ "${DEBUG}" -ne 0 ]; then
	        echo "full_query=$full_query" >&2
	    fi

	    status_code=
	    for tries in $(seq "${query_tries}"); do
	        # The response body goes to ${curl_output_file}; only the HTTP
	        # status code is captured on stdout.
	        status_code=$(curl -s \
	            -H "Authorization: token ${token}" \
	            -H "Accept: application/vnd.github.v3+json" \
	            -w "%{http_code}" \
	            -o "${curl_output_file}" \
	            "${full_query}")

	        # String comparison: the status may be empty or "000" when curl
	        # itself fails, which must count as a failure, not a test error.
	        if [ "${status_code}" = 200 ] || [ "${status_code}" = 422 ]; then
	            # Don't retry.
	            # 200 is success. 422 is what GitHub returns for requests past
	            # its result limit (see below), so retrying cannot help.
	            break
	        elif [ "${tries}" -lt "${query_tries}" ]; then
	            # Retry.
	            # Other status codes are failures. Failures are usually due to
	            # triggering the abuse detection mechanism for sending too many
	            # requests, so we add a delay before every further attempt.
	            sleep 20
	        fi
	    done

	    # GitHub only returns the first 1000 results. Requests past this limit
	    # return 422, so stop making requests.
	    if [ "${status_code}" = 422 ]; then
	        break
	    elif [ "${status_code}" != 200 ]; then
	        echo "GitHub query failed, last response:" >&2
	        cat "${curl_output_file}" >&2
	        rm -f "${curl_output_file}"
	        exit 1
	    fi

	    # Keep the html_url lines indented by at least 8 but fewer than 10
	    # spaces, and extract the URL (the 4th double-quote-delimited field).
	    grep "        \"html_url" < "${curl_output_file}" \
	        | grep -v "          " \
	        | sort -u \
	        | cut -d \" -f 4 >> "${tempfile}"
	done

	rm -f "${curl_output_file}"

	# Each loop iteration was sorted and unique; this does it for the full result.
	sort -u -o "${tempfile}" "${tempfile}"

	# Look up one commit hash per repository.
	while IFS= read -r line; do
	    # A repository URL looks like https://github.com/OWNER/REPO, so the
	    # owner and repo are the 4th and 5th '/'-separated fields.
	    owner=$(printf '%s\n' "${line}" | cut -d / -f 4)
	    repo=$(printf '%s\n' "${line}" | cut -d / -f 5)
	    hash_query="https://api.github.com/repos/${owner}/${repo}/commits?per_page=1"
	    # Each header needs its own -H; without it curl treats the header
	    # text as another URL to fetch.
	    hash=$(curl -s \
	        -H "Authorization: token ${token}" \
	        -H "Accept: application/vnd.github.v3+json" \
	        "${hash_query}" \
	        | grep '^    "sha":' \
	        | head -n 1 \
	        | cut -d \" -f 4)
	    # Always write exactly one line, even when no hash was found, so that
	    # ${hashfile} stays line-aligned with ${tempfile} for paste.
	    printf '%s\n' "${hash}" >> "${hashfile}"
	done < "${tempfile}"

	paste "${tempfile}" "${hashfile}"