pangeo-data/pangeo-stacks

KubeCluster broken in latest image

Opened this issue · 10 comments

Using the latest onbuild notebook (pangeo/pangeo-notebook-onbuild:2019.12.08), I can't start a KubeCluster:

from dask_kubernetes import KubeCluster
cluster = KubeCluster()
TypeError: argument of type 'NoneType' is not iterable

For an example, see https://mybinder.org/v2/gh/rabernat/RCES-Final_Assignment/patch-2

Full traceback:

/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_kubernetes/core.py in __init__(self, pod_template, name, namespace, n_workers, host, port, env, auth, scheduler_timeout, deploy_mode, interface, protocol, dashboard_address, security, **kwargs)
    378         self.auth = auth
    379         self.kwargs = kwargs
--> 380         super().__init__(**self.kwargs)
    381 
    382     async def _start(self):

/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name)
    240         if not self.asynchronous:
    241             self._loop_runner.start()
--> 242             self.sync(self._start)
    243             self.sync(self._correct_state)
    244 

/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    160             return future
    161         else:
--> 162             return sync(self.loop, func, *args, **kwargs)
    163 
    164     async def _logs(self, scheduler=True, workers=True):

/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    332     if error[0]:
    333         typ, exc, tb = error[0]
--> 334         raise exc.with_traceback(tb)
    335     else:
    336         return result[0]

/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/utils.py in f()
    316             if callback_timeout is not None:
    317                 future = gen.with_timeout(timedelta(seconds=callback_timeout), future)
--> 318             result[0] = yield future
    319         except Exception as exc:
    320             error[0] = sys.exc_info()

/srv/conda/envs/notebook/lib/python3.7/site-packages/tornado/gen.py in run(self)
    733 
    734                     try:
--> 735                         value = future.result()
    736                     except Exception:
    737                         exc_info = sys.exc_info()

/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_kubernetes/core.py in _start(self)
    435 
    436         self.pod_template = clean_pod_template(self.pod_template)
--> 437         await ClusterAuth.load_first(self.auth)
    438 
    439         self.core_api = kubernetes.client.CoreV1Api()

/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_kubernetes/auth.py in load_first(auth)
     68         for auth_instance in auth:
     69             try:
---> 70                 await auth_instance.load()
     71             except kubernetes.config.ConfigException as exc:
     72                 logger.debug(

/srv/conda/envs/notebook/lib/python3.7/site-packages/dask_kubernetes/auth.py in load(self)
    119     async def load(self):
    120         await kubernetes.config.load_kube_config(
--> 121             self.config_file, self.context, None, self.persist_config
    122         )
    123 

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in load_kube_config(config_file, context, client_configuration, persist_config)
    551     loader = _get_kube_config_loader_for_yaml_file(
    552         config_file, active_context=context,
--> 553         persist_config=persist_config)
    554     if client_configuration is None:
    555         config = type.__call__(Configuration)

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in _get_kube_config_loader_for_yaml_file(filename, persist_config, **kwargs)
    519         config_dict=kcfg.config,
    520         config_base_path=None,
--> 521         **kwargs)
    522 
    523 

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in __init__(self, config_dict, active_context, get_google_credentials, config_base_path, config_persister)
    142         self._cluster = None
    143         self.provider = None
--> 144         self.set_active_context(active_context)
    145         self._config_base_path = config_base_path
    146         self._config_persister = config_persister

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in set_active_context(self, context_name)
    152     def set_active_context(self, context_name=None):
    153         if context_name is None:
--> 154             context_name = self._config['current-context']
    155         self._current_context = self._config['contexts'].get_with_name(
    156             context_name)

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in __getitem__(self, key)
    403 
    404     def __getitem__(self, key):
--> 405         v = self.safe_get(key)
    406         if not v:
    407             raise ConfigException(

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes_asyncio/config/kube_config.py in safe_get(self, key)
    399     def safe_get(self, key):
    400         if (isinstance(self.value, list) and isinstance(key, int) or
--> 401                 key in self.value):
    402             return self.value[key]
    403 

TypeError: argument of type 'NoneType' is not iterable

Looks like dask/dask-kubernetes#165. Looking into it now.

One difference: on ocean.pangeo.io, we have this:

kubernetes.config.load_incluster_config()  # passes fine

But on the binder, we have:

>>> kubernetes.config.load_incluster_config()
ConfigException                           Traceback (most recent call last)
<ipython-input-7-bb26eae40ec2> in <module>
----> 1 kubernetes.config.load_incluster_config()

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes/config/incluster_config.py in load_incluster_config()
     94     not running in a kubernetes environment."""
     95     InClusterConfigLoader(token_filename=SERVICE_TOKEN_FILENAME,
---> 96                           cert_filename=SERVICE_CERT_FILENAME).load_and_set()

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes/config/incluster_config.py in load_and_set(self)
     45 
     46     def load_and_set(self):
---> 47         self._load_config()
     48         self._set_config()
     49 

/srv/conda/envs/notebook/lib/python3.7/site-packages/kubernetes/config/incluster_config.py in _load_config(self)
     62 
     63         if not os.path.isfile(self._token_filename):
---> 64             raise ConfigException("Service token file does not exists.")
     65 
     66         with open(self._token_filename) as f:

ConfigException: Service token file does not exists.

This is straining my Kubernetes knowledge. @jacobtomlinson, do you have any thoughts here? Maybe @jhamman knows a bit more about the deployment differences between the binder and the pangeo-cloud-federation deployments.

Kubernetes clients generally authenticate either with a local config file in your home directory or with a service account config that gets mounted into a pod automatically.

The example from @TomAugspurger suggests that you are expecting a service account to be mounted in the pod, but one doesn't exist.

@rabernat's example seems to suggest that neither exists, since dask-kubernetes tries both by default.
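To narrow down which (if either) of those auth sources exists in a given environment, a quick diagnostic like the one below can help. This is just a sketch, not part of dask-kubernetes; the token and kubeconfig paths are the standard defaults and are assumptions about this particular image.

import os
import kubernetes

# 1. Service account token mounted into the pod (in-cluster auth)
token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
print("service account token present:", os.path.isfile(token_file))

# 2. Local kube config (what load_kube_config falls back to)
kubeconfig = os.environ.get("KUBECONFIG", os.path.expanduser("~/.kube/config"))
print("kube config present:", os.path.isfile(kubeconfig))

# Try in-cluster auth explicitly and show why it fails, if it does
try:
    kubernetes.config.load_incluster_config()
    print("in-cluster config loaded")
except kubernetes.config.ConfigException as exc:
    print("in-cluster config failed:", exc)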

My gut feeling would be that this is a bug in dask-kubernetes rather than a cluster config issue. It would be interesting to know if the version has changed in your onbuild image. Is that something you can find out?
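(For example, something like this run inside the image should tell us; conda list dask-kubernetes in a terminal would work just as well.)

import dask_kubernetes
print(dask_kubernetes.__version__)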

This is with dask-kubernetes 0.10. IIUC, it was bumped in 380a4bd (22 days ago).

OK, I may be doing something dumb here. I need to retry with Pangeo Binder.

Oh, that might explain things :)

OK, so that was definitely dumb. Sorry for the false alarm.

However, things are still not working as they should, even on Pangeo Binder:

https://binder.pangeo.io/v2/gh/rabernat/RCES-Final_Assignment/patch-2

I can now create a KubeCluster, but I can't connect to it with a client.

from dask.distributed import Client

client = Client(cluster)
AssertionError
Task exception was never retrieved
future: <Task finished coro=<_wrap_awaitable() done, defined at /srv/conda/envs/notebook/lib/python3.7/asyncio/tasks.py:596> exception=AssertionError()>
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.7/asyncio/tasks.py", line 603, in _wrap_awaitable
    return (yield from awaitable.__await__())
  File "/srv/conda/envs/notebook/lib/python3.7/site-packages/distributed/deploy/spec.py", line 51, in _
    assert self.status == "running"
AssertionError

etc. etc.

Not quite sure what is going on.

It's failing on KubeCluster.scale(), probably in .start.

@jacobtomlinson: in https://github.com/dask/dask-kubernetes/blob/master/dask_kubernetes/core.py#L66-L74, we try to start the pod 10 times but catch ApiExceptions. If all 10 attempts fail, it looks like we just exit the function; then, back in SpecCluster.__await__, self.status never gets set to "running", so we hit the AssertionError above. Shouldn't Pod.start raise an exception if all the attempts failed?
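For reference, here is a rough sketch (not the actual dask-kubernetes code, and the function name is made up) of the behaviour I'm suggesting: keep the retry loop, but re-raise the last ApiException instead of returning silently once every attempt has failed.

from kubernetes_asyncio.client.rest import ApiException

async def create_pod_with_retries(core_api, namespace, pod_spec, retries=10):
    last_exc = None
    for _ in range(retries):
        try:
            # attempt to create the pod, as Pod.start does
            return await core_api.create_namespaced_pod(namespace, pod_spec)
        except ApiException as exc:
            last_exc = exc  # remember the failure and try again
    # every attempt failed: surface the error instead of returning None,
    # so we get a useful message rather than the bare AssertionError above
    raise last_exc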

I looked a bit more. We do want to surface an error there. I changed it locally, and now see:

HTTP response headers: <CIMultiDictProxy('Audit-Id': 'b167232c-77ef-4f35-b82d-dee48dfc7d62', 'Content-Type': 'application/json', 'Date': 'Thu, 19 Dec 2019 21:00:20 GMT', 'Transfer-Encoding': 'chunked')>

{
  "kind": "Status",
  "apiVersion": "v1",
  "metadata": {},
  "status": "Failure",
  "message": "Pod \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3fjgvb\" is invalid: [metadata.generateName: Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3\": a DNS-1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*'), metadata.name: Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3fjgvb\": a DNS-1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*'), spec.containers[0].name: Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g\": a DNS-1123 label must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name',  or '123-abc', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?')]",
  "reason": "Invalid",
  "details": {
    "name": "dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3fjgvb",
    "kind": "Pod",
    "causes": [
      {
        "reason": "FieldValueInvalid",
        "message": "Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3\": a DNS-1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*')",
        "field": "metadata.generateName"
      },
      {
        "reason": "FieldValueInvalid",
        "message": "Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g-7f997e55-3fjgvb\": a DNS-1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*')",
        "field": "metadata.name"
      },
      {
        "reason": "FieldValueInvalid",
        "message": "Invalid value: \"dask-rabernat-rces-final_assignment-x3rmds1g\": a DNS-1123 label must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name',  or '123-abc', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?')",
        "field": "spec.containers[0].name"
      }
    ]
  },
  "code": 422
}

So... @rabernat, pick a different name for your GitHub repo so it doesn't contain an underscore? Is it really as simple as that? Obviously we'll need to update whatever bit of code is generating that name.
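For illustration, a sketch of the kind of sanitisation that code could apply (the function name is hypothetical): lower-case the name and replace anything outside [a-z0-9-] with a hyphen, so the result is a valid DNS-1123 label.

import re

def sanitize_k8s_name(name, max_len=63):
    # replace invalid characters (e.g. '_') with '-' and lower-case everything
    name = re.sub(r"[^a-z0-9-]+", "-", name.lower())
    # labels must start and end with an alphanumeric character
    return name.strip("-")[:max_len].rstrip("-")

print(sanitize_k8s_name("dask-rabernat-rces-final_assignment-x3rmds1g"))
# -> dask-rabernat-rces-final-assignment-x3rmds1g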

Yeah, that was probably it.

>>> cluster = KubeCluster(name="dask-rabernat-rces")
>>> spec = cluster.pod_template
>>> spec.spec.containers[0].name = 'dask-rabernat-rces-final-assignment-x3rmds1g'  # changed the '_' to '-'
>>> cluster.scale(1)

And I see the container being created in Kubernetes now.

I'll open some issues on dask-kubernetes.