Kicking off a training job fails in local mode
Opened this issue · 0 comments
gatheluck commented
When I run the notebook code of a-sagemaker-test
in local mode, the following error is raised at cifar10_estimator.fit(inputs):
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
<ipython-input-6-2b195ccc4d5d> in <module>
1 # Kick traing job
----> 2 cifar10_estimator.fit(inputs)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
687 self._prepare_for_training(job_name=job_name)
688
--> 689 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
690 self.jobs.append(self.latest_training_job)
691 if wait:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config)
1469 """
1470 train_args = cls._get_train_args(estimator, inputs, experiment_config)
-> 1471 estimator.sagemaker_session.train(**train_args)
1472
1473 return cls(estimator.sagemaker_session, estimator._current_job_name)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image_uri, algorithm_arn, encrypt_inter_container_traffic, use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics, profiler_rule_configs, profiler_config, environment, retry_strategy)
583 LOGGER.info("Creating training-job with name: %s", job_name)
584 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
--> 585 self.sagemaker_client.create_training_job(**train_request)
586
587 def _get_train_request( # noqa: C901
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
184 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
185 logger.info("Starting training job")
--> 186 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
187
188 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
219
220 self.model_artifacts = self.container.train(
--> 221 input_data_config, output_data_config, hyperparameters, job_name
222 )
223 self.end_time = datetime.datetime.now()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
229
230 if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image):
--> 231 _pull_image(self.image)
232
233 process = subprocess.Popen(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/local/image.py in _pull_image(image)
1094 logger.info("docker command: %s", pull_image_command)
1095
-> 1096 subprocess.check_output(pull_image_command.split())
1097 logger.info("image pulled: %s", image)
~/anaconda3/envs/pytorch_p36/lib/python3.6/subprocess.py in check_output(timeout, *popenargs, **kwargs)
354
355 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
--> 356 **kwargs).stdout
357
358
~/anaconda3/envs/pytorch_p36/lib/python3.6/subprocess.py in run(input, timeout, check, *popenargs, **kwargs)
436 if check and retcode:
437 raise CalledProcessError(retcode, process.args,
--> 438 output=stdout, stderr=stderr)
439 return CompletedProcess(process.args, retcode, stdout, stderr)
440
CalledProcessError: Command '['docker', 'pull', '763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/pytorch-training:1.7.1-cpu-py3']' returned non-zero exit status 1.