mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
139 lines
5.2 KiB
YAML
139 lines
5.2 KiB
YAML
name: Setup Linux
|
|
|
|
description: Set up Docker workspace on EC2
|
|
|
|
runs:
|
|
using: composite
|
|
steps:
|
|
- name: Display EC2 information
  shell: bash
  env:
    # Hand the runner name to the script through the environment instead of
    # pasting the expression into the script text: an unquoted name containing
    # spaces or shell metacharacters would otherwise be word-split or executed.
    RUNNER_NAME_STR: ${{ runner.name }}
  run: |
    set -euo pipefail

    # Print one EC2 instance-metadata value for the category given in $1.
    # Pulled from instance metadata endpoint for EC2
    # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
    # When the marker file /.inarc exists, or the runner name contains "gcp",
    # the endpoint is not queried and an explanatory message is printed instead.
    function get_ec2_metadata() {
      local category=$1
      local runner_name_str=${RUNNER_NAME_STR:-}
      local token

      if [[ -f /.inarc ]]; then
        echo "ARC Runner, no info on ec2 metadata"
      elif [[ $runner_name_str == *"gcp"* ]]; then
        # If it is GCP runner (runner name contains gcp), do not run this
        echo "Runner is from Google Cloud Platform, No info on ec2 metadata"
      else
        # Request a session token (30 second TTL) first, then send it along
        # with the metadata request. A failed token request is tolerated so the
        # metadata request is still attempted, as before.
        token=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30") || true
        curl -H "X-aws-ec2-metadata-token: ${token}" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
      fi
    }

    echo "ami-id: $(get_ec2_metadata ami-id)"
    echo "instance-id: $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"
    echo "system info $(uname -a)"
|
|
|
|
- name: Check if in a container runner
  id: check_container_runner
  shell: bash
  run: |
    # Report "true" when either marker file (/.inarc or /.incontainer) exists,
    # "false" otherwise, as the IN_CONTAINER_RUNNER step output.
    if [ -f /.inarc ] || [ -f /.incontainer ]; then
      in_container_runner=true
    else
      in_container_runner=false
    fi
    echo "IN_CONTAINER_RUNNER=${in_container_runner}" >> "$GITHUB_OUTPUT"
|
|
|
|
- name: Start docker if docker daemon is not running
  shell: bash
  # Skipped when the check_container_runner step reported a container runner.
  if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
  run: |
    # Start the docker systemd unit only when it is not already active.
    if systemctl is-active --quiet docker; then
      echo "Docker daemon is running..."
    else
      echo "Starting docker daemon..."
      sudo systemctl start docker
    fi
|
|
|
|
- name: Log in to ECR
  uses: nick-fields/retry@v3.0.0
  env:
    AWS_RETRY_MODE: standard
    AWS_MAX_ATTEMPTS: "5"
    AWS_DEFAULT_REGION: us-east-1
  with:
    shell: bash
    timeout_minutes: 5
    max_attempts: 3
    retry_wait_seconds: 30
    command: |
      # Stop at the first failing command so that a failed login produces a
      # non-zero exit status for the retry wrapper instead of being masked by
      # the exit status of a later command.
      set -euo pipefail

      # Let the CLI extract the account id itself rather than scraping its
      # JSON output with grep/cut, which depends on the exact output layout.
      AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)

      # Log docker in to the ECR registry of the account we are running as.
      aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
        --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"

      # For LF Runners we need to make sure we also login to Meta's ECR docker registry too.
      META_AWS_ACCOUNT_ID=308535385114
      if [ "$AWS_ACCOUNT_ID" != "$META_AWS_ACCOUNT_ID" ]; then
        aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
          --password-stdin "$META_AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      fi
|
|
|
|
- name: Preserve github env variables for use in docker
  shell: bash
  run: |
    # Append every environment entry starting with GITHUB, followed by every
    # entry starting with CI, to a per-run file under /tmp.
    github_env_file="/tmp/github_env_${GITHUB_RUN_ID}"
    {
      env | grep '^GITHUB'
      env | grep '^CI'
    } >> "${github_env_file}"
|
|
|
|
- name: Kill any existing containers, clean up images
  shell: bash
  if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
  run: |
    # Collect the ids of all running containers; the list may be empty, and a
    # failure to list is tolerated just like a failure to stop.
    running_containers=$(docker ps -q) || true
    # The id list is deliberately left unquoted so each id becomes its own
    # argument (and an empty list becomes no argument at all).
    # shellcheck disable=SC2086
    docker stop ${running_containers} || true

    # Remove all unused docker data, including every unused image, without
    # asking for confirmation.
    docker system prune --all --force
|
|
|
|
- name: Manually resolve download.pytorch.org
  shell: bash
  continue-on-error: true
  run: |
    # Pin download.pytorch.org to one resolved IPv4 address in /etc/hosts.
    # `set +e` lets the script continue past failing commands and `set -x`
    # traces every command into the log.
    set +e
    set -x

    PT_DOMAIN=download.pytorch.org
    # TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
    # cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
    # one is returned at random
    RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)

    if [ -z "${RESOLVED_IP}" ]; then
      echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
      RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)

      if [ -z "${RESOLVED_IP}" ]; then
        echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
        exit 1
      fi
    fi

    # Match the domain as a fixed string: as a regular expression its dots
    # would match any character. Matching lines are printed to the log.
    if grep -F -- "${PT_DOMAIN}" /etc/hosts; then
      # Clean up any old records first. The dots are escaped so that sed only
      # deletes lines that contain the literal domain name.
      PT_DOMAIN_REGEX=${PT_DOMAIN//./\\.}
      sudo sed -i "/${PT_DOMAIN_REGEX}/d" /etc/hosts
    fi

    echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
    cat /etc/hosts
|
|
|
|
- name: Check that the docker daemon is running
  shell: bash
  continue-on-error: true
  if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
  run: |
    # Poll `docker info` until it succeeds, for at most max_attempts tries
    # with `delay` seconds between tries. Exits 0 as soon as docker responds
    # and 1 when every attempt failed.
    set +x

    max_attempts=30
    delay=10

    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
      echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..."
      if docker info > /dev/null 2>&1; then
        echo "Docker is running. Proceeding with the next steps"
        exit 0
      fi

      echo "Docker is not running yet."
      # Only announce a retry and wait when another attempt will actually
      # follow; after the final attempt fall through to the failure message.
      if ((attempt < max_attempts)); then
        echo "Retrying in $delay seconds..."
        sleep "$delay"
      fi
    done

    echo "Reached maximum attempts to connect to Docker. Exiting."
    exit 1
|