Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ In this lab you will build Cloud Native infrastructure required for running dist
[Torch Elastic Docs](https://pytorch.org/elastic/0.2.2/index.html)

[Azure Spot VMs](https://docs.microsoft.com/en-us/azure/virtual-machines/spot-vms)

Apply the 4-GPU kustomize overlay:

```sh
kustomize build kube/overlays/4gpu | kubectl apply -f -
```

Apply the ImageNet training manifest:

```sh
kubectl apply -f kube/imagenet.yaml
```
87 changes: 87 additions & 0 deletions examples/imagenet/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
from torch.nn.parallel import DistributedDataParallel
from torch.optim import SGD
from torch.utils.data import DataLoader
import random


model_names = sorted(
Expand Down Expand Up @@ -145,6 +146,9 @@
type=str,
help="checkpoint file path, to load and save to",
)
parser.add_argument('--amp', action='store_true', help='use automatic mixed precision')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--accum-steps', type=int, default=1)


def main():
Expand Down Expand Up @@ -590,5 +594,88 @@ def accuracy(output, target, topk=(1,)):
return res


def _positive_int(value: str) -> int:
    """Argparse ``type`` callback that accepts only integers >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            "invalid int value: {!r}".format(value)
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            "must be a positive integer, got {}".format(number)
        )
    return number


def parse_args(argv=None) -> argparse.Namespace:
    """Build the training argument parser and parse the command line.

    Args:
        argv: Optional sequence of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing ``parse_args()`` callers behave exactly as before.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: raised by argparse when the arguments are invalid
            (including a non-positive ``--accum-steps``) or when
            ``-h/--help`` is requested.
    """
    parser = argparse.ArgumentParser(description="PyTorch Elastic ImageNet Training")
    parser.add_argument("data", metavar="DIR", help="path to dataset")
    parser.add_argument(
        "-a",
        "--arch",
        metavar="ARCH",
        default="resnet18",
        # Valid architectures come from the module-level ``model_names`` list.
        choices=model_names,
        help="model architecture: " + " | ".join(model_names) + " (default: resnet18)",
    )
    parser.add_argument(
        "-j",
        "--workers",
        default=0,
        type=int,
        metavar="N",
        help="number of data loading workers",
    )
    parser.add_argument(
        "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run"
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        default=32,
        type=int,
        metavar="N",
        help="mini-batch size (default: 32), per worker (GPU)",
    )
    parser.add_argument(
        "--lr",
        "--learning-rate",
        default=0.1,
        type=float,
        metavar="LR",
        help="initial learning rate",
        dest="lr",
    )
    parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
    parser.add_argument(
        "--wd",
        "--weight-decay",
        default=1e-4,
        type=float,
        metavar="W",
        help="weight decay (default: 1e-4)",
        dest="weight_decay",
    )
    parser.add_argument(
        "-p",
        "--print-freq",
        default=10,
        type=int,
        metavar="N",
        help="print frequency (default: 10)",
    )
    parser.add_argument(
        "--dist-backend",
        default="nccl",
        choices=["nccl", "gloo"],
        type=str,
        help="distributed backend",
    )
    parser.add_argument(
        "--checkpoint-file",
        default="/tmp/checkpoint.pth.tar",
        type=str,
        help="checkpoint file path, to load and save to",
    )
    parser.add_argument("--amp", action="store_true", help="use automatic mixed precision")
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        metavar="N",
        help="random seed (default: 42)",
    )
    parser.add_argument(
        "--accum-steps",
        # Zero or negative step counts are meaningless, so reject them at
        # parse time instead of letting them reach the training loop.
        type=_positive_int,
        default=1,
        metavar="N",
        help="accumulation steps, must be >= 1 (default: 1)",
    )
    return parser.parse_args(argv)


def set_seed(seed: int):
    """Seed the random number generators used by this script.

    The same ``seed`` is passed, in order, to ``random.seed``,
    ``numpy.random.seed``, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed_all``.

    Args:
        seed: Integer seed handed unchanged to every generator.
    """
    seeders = (
        random.seed,
        numpy.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
4 changes: 2 additions & 2 deletions kube/imagenet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spec:
image: torchelastic/examples:0.2.0
imagePullPolicy: Always
args:
- "--nproc_per_node=1"
- "--nproc_per_node=4"
- "/workspace/examples/imagenet/main.py"
- "--arch=resnet18"
- "--epochs=3"
Expand All @@ -47,7 +47,7 @@ spec:
- "--checkpoint-file=/mnt/blob/data/checkpoint.pth.tar"
resources:
limits:
nvidia.com/gpu: 1
nvidia.com/gpu: 4
volumeMounts:
- name: trainingdata
mountPath: "/mnt/blob/data"
Expand Down