This repository was archived by the owner on May 29, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 73
This repository was archived by the owner on May 29, 2025. It is now read-only.
Albert does not run locally due to FSx data source #43
Copy link
Copy link
Open
Description
Error:
root@6eca14cacd82:/shared/sm_bert_tf/tf2/deep-learning-models/models/nlp# ./launch.sh
wandb: WARNING W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.
2020-10-26 13:42:31.332706: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
Parameter distribution will be renamed to {'mpi': {'enabled': True, 'processes_per_host': 8, 'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none'}} in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
DATA_SOURCE {'FileSystemDataSource': {'FileSystemId': 'fs-0d3f0dfc5f6428d70', 'FileSystemType': 'FSxLustre', 'DirectoryPath': '/fsx', 'FileSystemAccessMode': 'rw'}}
Traceback (most recent call last):
File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/shared/sm_bert_tf/tf2/deep-learning-models/models/nlp/albert/launch_sagemaker.py", line 63, in <module>
security_group_ids=sm_args.security_group_ids,
File "/shared/sm_bert_tf/tf2/deep-learning-models/models/nlp/common/sagemaker_utils.py", line 97, in launch_sagemaker_job
estimator_hvd.fit(fsx_input)
File "/usr/local/lib/python3.7/dist-packages/sagemaker/tensorflow/estimator.py", line 483, in fit
fit_super()
File "/usr/local/lib/python3.7/dist-packages/sagemaker/tensorflow/estimator.py", line 462, in fit_super
super(TensorFlow, self).fit(inputs, wait, logs, job_name, experiment_config)
File "/usr/local/lib/python3.7/dist-packages/sagemaker/estimator.py", line 496, in fit
self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
File "/usr/local/lib/python3.7/dist-packages/sagemaker/estimator.py", line 1091, in start_new
estimator.sagemaker_session.train(**train_args)
File "/usr/local/lib/python3.7/dist-packages/sagemaker/session.py", line 590, in train
self.sagemaker_client.create_training_job(**train_request)
File "/usr/local/lib/python3.7/dist-packages/sagemaker/local/local_session.py", line 102, in create_training_job
training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
File "/usr/local/lib/python3.7/dist-packages/sagemaker/local/entities.py", line 80, in start
"Need channel['DataSource'] to have ['S3DataSource'] or ['FileDataSource']"
ValueError: Need channel['DataSource'] to have ['S3DataSource'] or ['FileDataSource']
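This happens because the SageMaker local-mode training job (`_LocalTrainingJob`) does not support `FileSystemDataSource` (FSx) input channels; it accepts only `S3DataSource` or `FileDataSource`.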
Metadata
Metadata
Assignees
Labels
No labels