KubeCluster broken in latest image
Opened this issue · 10 comments
Using the latest onbuild notebook (pangeo/pangeo-notebook-onbuild:2019.12.08), I can't start a KubeCluster
from dask_kubernetes import KubeCluster
cluster = KubeCluster()
TypeError: argument of type 'NoneType' is not iterable
For an example, see https://mybinder.org/v2/gh/rabernat/RCES-Final_Assignment/patch-2
Full traceback
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_kubernetes/core.py in __init__(self, pod_template, name, namespace, n_workers, host, port, env, auth, scheduler_timeout, deploy_mode, interface, protocol, dashboard_address, security, **kwargs)
378 self.auth = auth
379 self.kwargs = kwargs
--> 380 super().__init__(**self.kwargs)
381
382 async def _start(self):
/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name)
240 if not self.asynchronous:
241 self._loop_runner.start()
--> 242 self.sync(self._start)
243 self.sync(self._correct_state)
244
/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
160 return future
161 else:
--> 162 return sync(self.loop, func, *args, **kwargs)
163
164 async def _logs(self, scheduler=True, workers=True):
/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
332 if error[0]:
333 typ, exc, tb = error[0]
--> 334 raise exc.with_traceback(tb)
335 else:
336 return result[0]
/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/utils.py in f()
316 if callback_timeout is not None:
317 future = gen.with_timeout(timedelta(seconds=callback_timeout), future)
--> 318 result[0] = yield future
319 except Exception as exc:
320 error[0] = sys.exc_info()
/srv/conda/envs/notebook/lib/python3.7/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_kubernetes/core.py in _start(self)
435
436 self.pod_template = clean_pod_template(self.pod_template)
--> 437 await ClusterAuth.load_first(self.auth)
438
439 self.core_api = kubernetes.client.CoreV1Api()
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_kubernetes/auth.py in load_first(auth)
68 for auth_instance in auth:
69 try:
---> 70 await auth_instance.load()
71 except kubernetes.config.ConfigException as exc:
72 logger.debug(
/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_kubernetes/auth.py in load(self)
119 async def load(self):
120 await kubernetes.config.load_kube_config(
--> 121 self.config_file, self.context, None, self.persist_config
122 )
123
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in load_kube_config(config_file, context, client_configuration, persist_config)
551 loader = _get_kube_config_loader_for_yaml_file(
552 config_file, active_context=context,
--> 553 persist_config=persist_config)
554 if client_configuration is None:
555 config = type.__call__(Configuration)
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in _get_kube_config_loader_for_yaml_file(filename, persist_config, **kwargs)
519 config_dict=kcfg.config,
520 config_base_path=None,
--> 521 **kwargs)
522
523
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in __init__(self, config_dict, active_context, get_google_credentials, config_base_path, config_persister)
142 self._cluster = None
143 self.provider = None
--> 144 self.set_active_context(active_context)
145 self._config_base_path = config_base_path
146 self._config_persister = config_persister
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in set_active_context(self, context_name)
152 def set_active_context(self, context_name=None):
153 if context_name is None:
--> 154 context_name = self._config['current-context']
155 self._current_context = self._config['contexts'].get_with_name(
156 context_name)
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in __getitem__(self, key)
403
404 def __getitem__(self, key):
--> 405 v = self.safe_get(key)
406 if not v:
407 raise ConfigException(
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in safe_get(self, key)
399 def safe_get(self, key):
400 if (isinstance(self.value, list) and isinstance(key, int) or
--> 401 key in self.value):
402 return self.value[key]
403
TypeError: argument of type 'NoneType' is not iterable
Looks like dask/dask-kubernetes#165. Looking into it now
One difference: on ocean.pangeo.io, we have this
kubernetes.config.load_incluster_config() # passes fine
But on the binder, we have
>>> kubernetes.config.load_incluster_config()
ConfigException Traceback (most recent call last)
<ipython-input-7-bb26eae40ec2> in <module>
----> 1 kubernetes.config.load_incluster_config()
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes/config/incluster_config.py in load_incluster_config()
94 not running in a kubernetes environment."""
95 InClusterConfigLoader(token_filename=SERVICE_TOKEN_FILENAME,
---> 96 cert_filename=SERVICE_CERT_FILENAME).load_and_set()
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes/config/incluster_config.py in load_and_set(self)
45
46 def load_and_set(self):
---> 47 self._load_config()
48 self._set_config()
49
/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes/config/incluster_config.py in _load_config(self)
62
63 if not os.path.isfile(self._token_filename):
---> 64 raise ConfigException("Service token file does not exists.")
65
66 with open(self._token_filename) as f:
ConfigException: Service token file does not exists.
This is straining my kubernetes knowledge. @jacobtomlinson do you have any thoughts here, and maybe @jhamman knows a bit more about the deployment differences between the binder and the pangeo-cloud-federation deployments?
Kubernetes clients generally authenticate with a local config in your home directory, or a service account config which gets mounted into a pod automatically.
The example from @TomAugspurger suggests that you are expecting a service account to be mounted in the pod, but one doesn't exist.
@rabernat's example seems to suggest that neither exists as dask-kubernetes tries both by default.
My gut feeling would be that this is a bug in dask-kubernetes rather than a cluster config issue. It would be interesting to know if the version has changed in your onbuild image. Is that something you can find out?
This is with dask-kubernetes 0.10. IIUC, it was bumped in 380a4bd (22 days ago)
Ok I may be doing something dumb here. Need to retry with pangeo binder.
Oh, that might explain things :)
Ok so that was definitely dumb. Sorry for the false alarm.
However, things are still not working as they should, even in pangeo binder:
https://binder.pangeo.io/v2/gh/rabernat/RCES-Final_Assignment/patch-2
I can now create a KubeCluster, but I can't connect to it with a client.
client = Client(cluster)
AssertionError
Task exception was never retrieved
future: <Task finished coro=<_wrap_awaitable() done, defined at /srv/conda/envs/notebook/lib/python3.7/asyncio/tasks.py:596> exception=AssertionError()>
Traceback (most recent call last):
File "/srv/conda/envs/notebook/lib/python3.7/asyncio/tasks.py", line 603, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/deploy/spec.py", line 51, in _
assert self.status == "running"
AssertionError
etc. etc.
Not quite sure what is going on.
It's failing on KubeCluster.scale(), probably in .start
@jacobtomlinson in https://github.com/dask/dask-kubernetes/blob/master/dask_kubernetes/core.py#L66-L74, we try to start it 10 times, but catch ApiExceptions. If all 10 fail, it looks like we just exit the function, and then we go back to SpecCluster.__await__
, but self.running isn't set, so we hit the AssertionError. Shouldn't Pod.start raise an exception if all the tries failed?
Looked a bit more. We do want to surface an error there. I changed it locally, and see
HTTP response headers: <CIMultiDictProxy('Audit-Id': 'b167232c-77ef-4f35-b82d-dee48dfc7d62', 'Content-Type': 'application/json', 'Date': 'Thu, 19 Dec 2019 21:00:20 GMT', 'Transfer-Encoding': 'chunked')>
{
"kind": "Status",
"apiVersion": "v1",
"metadata": {},
"status": "Failure",
"message": "Pod \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3fjgvb\" is invalid: [metadata.generateName: Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3\": a DNS-1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*'), metadata.name: Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3fjgvb\": a DNS-1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*'), spec.containers[0].name: Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g\": a DNS-1123 label must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?')]",
"reason": "Invalid",
"details": {
"name": "dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3fjgvb",
"kind": "Pod",
"causes": [
{
"reason": "FieldValueInvalid",
"message": "Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3\": a DNS-1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*')",
"field": "metadata.generateName"
},
{
"reason": "FieldValueInvalid",
"message": "Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3fjgvb\": a DNS-1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*')",
"field": "metadata.name"
},
{
"reason": "FieldValueInvalid",
"message": "Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g\": a DNS-1123 label must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?')",
"field": "spec.containers[0].name"
}
]
},
"code": 422
}
So... @rabernat pick a different name for your GitHub repo to not have an underscore? Is it as silly as that? Obviously we'll need to update whatever bit of code is generating that name.
Yeah, that was probably it.
>>> cluster = KubeCluster(name="dask-rabernat-rces")
>>> spec = cluster.pod_template
>>> spec.spec.containers[0].name = 'dask-rabernat-rces-final-assignment-x3rmds1g' # changed the '_' to '-'
>>> cluster.scale(1)
And I see the container creating in kubernetes now.
I'll open some issues on dask-kubernetes.