# amlarc-python-sdk-train-test.yml
# From a repository forked from Azure/azureml-examples.
# Workflow that runs the python-sdk training example jobs through
# cli/amlarc-compute.sh, both on a schedule and on demand.
name: amlarc-python-sdk-train-test
on:
  # Twice a day, at 06:00 and 18:00 (GitHub evaluates cron in UTC).
  schedule:
    - cron: "0 6,18 * * *"
  # Manual trigger; the caller picks which release train to test.
  workflow_dispatch:
    inputs:
      AMLARC_RELEASE_TRAIN:
        description: 'Release version: experimental, staging or stable'
        required: true
        default: 'experimental'
jobs:
  # Runs the CPU training examples against the "cpu-cluster" compute target.
  cpu-test:
    runs-on: ubuntu-latest
    env:
      # NOTE(review): inputs are only populated for workflow_dispatch; on
      # schedule-triggered runs this expression expands to an empty string,
      # so the input's 'experimental' default is not applied. Confirm that
      # cli/amlarc-compute.sh copes with an empty value.
      INPUT_AMLARC_RELEASE_TRAIN: ${{ github.event.inputs.AMLARC_RELEASE_TRAIN }}
      ARC_CLUSTER_PREFIX: amlarc-arc-py
      AKS_CLUSTER_PREFIX: amlarc-aks-py
      # NOTE(review): gpu-test uses a different workspace name
      # (main-python-sdk-amlarc) - confirm the difference is intentional.
      WORKSPACE: amlarc-ws-py
    steps:
      - name: check out repo
        uses: actions/checkout@v2
      - name: setup python
        uses: actions/setup-python@v2
        with:
          # Quoted so the version stays a string rather than a float.
          python-version: "3.8"
      - name: pip install
        run: pip install -r python-sdk/requirements.txt
      - name: azure login
        uses: azure/login@v1
        with:
          creds: ${{secrets.AZ_AE_CREDS}}
      - name: install tools
        run: |
          bash cli/amlarc-compute.sh install_tools
          bash cli/amlarc-compute.sh prepare_attach_compute_py
      - name: setup STANDARD_DS3_V2 cluster  # cpu-cluster
        run: bash cli/amlarc-compute.sh setup_cluster STANDARD_DS3_V2 5 10
      - name: setup cpu-cluster compute
        run: bash cli/amlarc-compute.sh setup_compute STANDARD_DS3_V2 cpu-cluster
      - name: install azmlcli
        run: az extension add -n azure-cli-ml -y
      - name: get workspace config
        run: bash cli/amlarc-compute.sh attach_workspace

      # Example jobs. Each one sets continue-on-error so that a failing
      # example neither fails the job nor prevents the remaining examples
      # from running.
      - name: run python-sdk/workflows/train/fastai/mnist/job.py  # cpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/fastai/mnist/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/fastai/mnist-mlproject/job.py  # cpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/fastai/mnist-mlproject/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/lightgbm/iris/job.py  # cpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/lightgbm/iris/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/scikit-learn/diabetes/job.py  # cpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/scikit-learn/diabetes/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/scikit-learn/diabetes-mlproject/job.py  # cpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/scikit-learn/diabetes-mlproject/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/xgboost/iris/job.py  # cpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/xgboost/iris/job.py
        continue-on-error: true

      # Teardown and reporting. always() makes these run even when an
      # earlier step failed or the run was cancelled.
      - name: uninstall azmlcli
        if: ${{ always() }}
        run: az extension remove -n azure-cli-ml
      - name: clean up STANDARD_DS3_V2 cluster
        if: ${{ always() }}
        run: bash cli/amlarc-compute.sh clean_up_cluster STANDARD_DS3_V2
      # Presumably tallies the run_py_test outcomes - confirm in
      # cli/amlarc-compute.sh.
      - name: count result
        if: ${{ always() }}
        run: bash cli/amlarc-compute.sh count_result

  # Runs the GPU training examples. Both compute targets ("gpu-cluster"
  # and "gpu-K80-2") are set up on the same STANDARD_NC12 cluster; the
  # separate STANDARD_NC6 cluster is commented out.
  gpu-test:
    runs-on: ubuntu-latest
    env:
      # NOTE(review): inputs are only populated for workflow_dispatch; on
      # schedule-triggered runs this expression expands to an empty string,
      # so the input's 'experimental' default is not applied. Confirm that
      # cli/amlarc-compute.sh copes with an empty value.
      INPUT_AMLARC_RELEASE_TRAIN: ${{ github.event.inputs.AMLARC_RELEASE_TRAIN }}
      ARC_CLUSTER_PREFIX: amlarc-arc-py
      AKS_CLUSTER_PREFIX: amlarc-aks-py
      # NOTE(review): cpu-test uses a different workspace name
      # (amlarc-ws-py) - confirm the difference is intentional.
      WORKSPACE: main-python-sdk-amlarc
    steps:
      - name: check out repo
        uses: actions/checkout@v2
      - name: setup python
        uses: actions/setup-python@v2
        with:
          # Quoted so the version stays a string rather than a float.
          python-version: "3.8"
      - name: pip install
        run: pip install -r python-sdk/requirements.txt
      - name: azure login
        uses: azure/login@v1
        with:
          creds: ${{secrets.AZ_AE_CREDS}}
      - name: init env
        run: |
          bash cli/amlarc-compute.sh install_tools
          bash cli/amlarc-compute.sh prepare_attach_compute_py
      # - name: setup STANDARD_NC6 cluster  # gpu-cluster
      #   run: bash cli/amlarc-compute.sh setup_cluster STANDARD_NC6 4 4
      - name: setup STANDARD_NC12 cluster  # gpu-K80-2
        run: bash cli/amlarc-compute.sh setup_cluster STANDARD_NC12 4 4
      - name: setup gpu-cluster compute
        run: bash cli/amlarc-compute.sh setup_compute STANDARD_NC12 gpu-cluster
      - name: setup gpu-K80-2 compute
        run: bash cli/amlarc-compute.sh setup_compute STANDARD_NC12 gpu-K80-2
      - name: install azmlcli
        run: az extension add -n azure-cli-ml -y
      - name: attach to workspace
        run: bash cli/amlarc-compute.sh attach_workspace

      # Skipping the V100 test cases because of a lack of quota.
      # - name: run train/deepspeed/cifar job  # gpu-V100-2
      #   run: python python-sdk/workflows/train/deepspeed/cifar/job.py
      # - name: run train/deepspeed/transformers job  # gpu-V100-4
      #   run: python python-sdk/workflows/train/deepspeed/transformers/job.py

      # Example jobs. Each one sets continue-on-error so that a failing
      # example neither fails the job nor prevents the remaining examples
      # from running.
      - name: run python-sdk/workflows/train/fastai/pets/job.py  # gpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/fastai/pets/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/pytorch/cifar-distributed/job.py  # gpu-K80-2
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/pytorch/cifar-distributed/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/pytorch/mnist/job.py  # gpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/pytorch/mnist/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/pytorch/mnist-mlproject/job.py  # gpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/pytorch/mnist-mlproject/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/tensorflow/mnist-distributed-horovod/job.py  # gpu-K80-2
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/tensorflow/mnist-distributed-horovod/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/tensorflow/mnist-distributed/job.py  # gpu-K80-2
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/tensorflow/mnist-distributed/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/tensorflow/mnist/job.py  # gpu-cluster
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/tensorflow/mnist/job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/transformers/glue/1-aml-finetune-job.py  # gpu-K80-2
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/transformers/glue/1-aml-finetune-job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/transformers/glue/2-aml-comparison-of-sku-job.py  # gpu-cluster gpu-K80-2
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/transformers/glue/2-aml-comparison-of-sku-job.py
        continue-on-error: true
      - name: run python-sdk/workflows/train/transformers/glue/3-aml-hyperdrive-job.py  # gpu-K80-2
        run: bash cli/amlarc-compute.sh run_py_test python-sdk/workflows/train/transformers/glue/3-aml-hyperdrive-job.py
        continue-on-error: true

      # Teardown and reporting. always() makes these run even when an
      # earlier step failed or the run was cancelled.
      - name: uninstall azmlcli
        if: ${{ always() }}
        run: az extension remove -n azure-cli-ml
      # - name: clean up STANDARD_NC6 cluster
      #   if: ${{ always() }}
      #   run: bash cli/amlarc-compute.sh clean_up_cluster STANDARD_NC6
      - name: clean up STANDARD_NC12 cluster
        if: ${{ always() }}
        run: bash cli/amlarc-compute.sh clean_up_cluster STANDARD_NC12
      # Presumably tallies the run_py_test outcomes - confirm in
      # cli/amlarc-compute.sh.
      - name: count result
        if: ${{ always() }}
        run: bash cli/amlarc-compute.sh count_result