gatheluck/AwsTerraform

Kicking off a training job fails in local mode

Opened this issue · 0 comments

When I run the notebook code of a-sagemaker-test in local mode, the following error is raised at `cifar10_estimator.fit(inputs)`:

---------------------------------------------------------------------------
CalledProcessError                        Traceback (most recent call last)
<ipython-input-6-2b195ccc4d5d> in <module>
      1 # Kick traing job
----> 2 cifar10_estimator.fit(inputs)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
    687         self._prepare_for_training(job_name=job_name)
    688 
--> 689         self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
    690         self.jobs.append(self.latest_training_job)
    691         if wait:

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config)
   1469         """
   1470         train_args = cls._get_train_args(estimator, inputs, experiment_config)
-> 1471         estimator.sagemaker_session.train(**train_args)
   1472 
   1473         return cls(estimator.sagemaker_session, estimator._current_job_name)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image_uri, algorithm_arn, encrypt_inter_container_traffic, use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics, profiler_rule_configs, profiler_config, environment, retry_strategy)
    583         LOGGER.info("Creating training-job with name: %s", job_name)
    584         LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
--> 585         self.sagemaker_client.create_training_job(**train_request)
    586 
    587     def _get_train_request(  # noqa: C901

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
    184         hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
    185         logger.info("Starting training job")
--> 186         training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
    187 
    188         LocalSagemakerClient._training_jobs[TrainingJobName] = training_job

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
    219 
    220         self.model_artifacts = self.container.train(
--> 221             input_data_config, output_data_config, hyperparameters, job_name
    222         )
    223         self.end_time = datetime.datetime.now()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
    229 
    230         if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image):
--> 231             _pull_image(self.image)
    232 
    233         process = subprocess.Popen(

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sagemaker/local/image.py in _pull_image(image)
   1094     logger.info("docker command: %s", pull_image_command)
   1095 
-> 1096     subprocess.check_output(pull_image_command.split())
   1097     logger.info("image pulled: %s", image)

~/anaconda3/envs/pytorch_p36/lib/python3.6/subprocess.py in check_output(timeout, *popenargs, **kwargs)
    354 
    355     return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
--> 356                **kwargs).stdout
    357 
    358 

~/anaconda3/envs/pytorch_p36/lib/python3.6/subprocess.py in run(input, timeout, check, *popenargs, **kwargs)
    436         if check and retcode:
    437             raise CalledProcessError(retcode, process.args,
--> 438                                      output=stdout, stderr=stderr)
    439     return CompletedProcess(process.args, retcode, stdout, stderr)
    440 

CalledProcessError: Command '['docker', 'pull', '763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/pytorch-training:1.7.1-cpu-py3']' returned non-zero exit status 1.