can't run the notebook locally
sylvain471 opened this issue · 2 comments
Hello, I am very interested in this work and am trying to run it locally.
However, I am stuck at the following cell:
# Extract sections
sections_ds = ds.flat_map(extract_sections)
sections_ds.count()
sections_ds.count()
throws the following error. Any idea what might solve this issue?
{
"name": "RayTaskError(FileNotFoundError)",
"message": "ray::FlatMap(extract_sections)() (pid=153397, ip=192.168.1.82)
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 405, in _map_task
for b_out in map_transformer.apply_transform(iter(blocks), ctx):
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 345, in __call__
for data in iter:
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 171, in __call__
yield from self._row_fn(input, ctx)
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 245, in transform_fn
for out_row in fn(row):
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 119, in fn
return op_fn(item, *fn_args, **fn_kwargs)
File \"/tmp/ray/session_2023-10-11_12-45-18_995895_152214/runtime_resources/working_dir_files/_ray_pkg_74b1a494592133c8/rag/data.py\", line 29, in extract_sections
with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
FileNotFoundError: [Errno 2] No such file or directory: 'docs.ray.io/en/master/tune.html'",
"stack": "---------------------------------------------------------------------------
ObjectRefStreamEndOfStreamError Traceback (most recent call last)
File python/ray/_raylet.pyx:345, in ray._raylet.StreamingObjectRefGenerator._next_sync()
File python/ray/_raylet.pyx:4533, in ray._raylet.CoreWorker.try_read_next_object_ref_stream()
File python/ray/_raylet.pyx:443, in ray._raylet.check_status()
ObjectRefStreamEndOfStreamError:
During handling of the above exception, another exception occurred:
StopIteration Traceback (most recent call last)
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py:80, in DataOpTask.on_waitable_ready(self)
79 try:
---> 80 meta = ray.get(next(self._streaming_gen))
81 except StopIteration:
82 # The generator should always yield 2 values (block and metadata)
83 # each time. If we get a StopIteration here, it means an error
(...)
86 # TODO(hchen): Ray Core should have a better interface for
87 # detecting and obtaining the exception.
File python/ray/_raylet.pyx:300, in ray._raylet.StreamingObjectRefGenerator.__next__()
File python/ray/_raylet.pyx:351, in ray._raylet.StreamingObjectRefGenerator._next_sync()
StopIteration:
During handling of the above exception, another exception occurred:
RayTaskError(FileNotFoundError) Traceback (most recent call last)
/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb Cell 20 line 4
<a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=0'>1</a> # Extract sections
<a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=1'>2</a> #ray.data.DataContext.get_current().execution_options.verbose_progress = True
<a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=2'>3</a> sections_ds = ds.flat_map(extract_sections)
----> <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=3'>4</a> sections_ds.count()
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/dataset.py:2498, in Dataset.count(self)
2492 return meta_count
2494 get_num_rows = cached_remote_fn(_get_num_rows)
2496 return sum(
2497 ray.get(
-> 2498 [get_num_rows.remote(block) for block in self.get_internal_block_refs()]
2499 )
2500 )
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/dataset.py:4799, in Dataset.get_internal_block_refs(self)
4780 @ConsumptionAPI(pattern=\"Time complexity:\")
4781 @DeveloperAPI
4782 def get_internal_block_refs(self) -> List[ObjectRef[Block]]:
4783 \"\"\"Get a list of references to the underlying blocks of this dataset.
4784
4785 This function can be used for zero-copy access to the data. It blocks
(...)
4797 A list of references to this dataset's blocks.
4798 \"\"\"
-> 4799 blocks = self._plan.execute().get_blocks()
4800 self._synchronize_progress_bar()
4801 return blocks
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/plan.py:591, in ExecutionPlan.execute(self, allow_clear_input_blocks, force_read, preserve_order)
589 else:
590 executor = BulkExecutor(copy.deepcopy(context.execution_options))
--> 591 blocks = execute_to_legacy_block_list(
592 executor,
593 self,
594 allow_clear_input_blocks=allow_clear_input_blocks,
595 dataset_uuid=self._dataset_uuid,
596 preserve_order=preserve_order,
597 )
598 # TODO(ekl) we shouldn't need to set this in the future once we move
599 # to a fully lazy execution model, unless .materialize() is used. Th
600 # reason we need it right now is since the user may iterate over a
601 # Dataset multiple times after fully executing it once.
602 if not self._run_by_consumer:
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/legacy_compat.py:119, in execute_to_legacy_block_list(executor, plan, allow_clear_input_blocks, dataset_uuid, preserve_order)
112 dag, stats = _get_execution_dag(
113 executor,
114 plan,
115 allow_clear_input_blocks,
116 preserve_order,
117 )
118 bundles = executor.execute(dag, initial_stats=stats)
--> 119 block_list = _bundles_to_block_list(bundles)
120 # Set the stats UUID after execution finishes.
121 _set_stats_uuid_recursive(executor.get_stats(), dataset_uuid)
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/legacy_compat.py:357, in _bundles_to_block_list(bundles)
355 blocks, metadata = [], []
356 owns_blocks = True
--> 357 for ref_bundle in bundles:
358 if not ref_bundle.owns_blocks:
359 owns_blocks = False
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/executor.py:37, in OutputIterator.__next__(self)
36 def __next__(self) -> RefBundle:
---> 37 return self.get_next()
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:129, in StreamingExecutor.execute.<locals>.StreamIterator.get_next(self, output_split_idx)
127 raise StopIteration
128 elif isinstance(item, Exception):
--> 129 raise item
130 else:
131 # Otherwise return a concrete RefBundle.
132 if self._outer._global_info:
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:187, in StreamingExecutor.run(self)
181 \"\"\"Run the control loop in a helper thread.
182
183 Results are returned via the output node's outqueue.
184 \"\"\"
185 try:
186 # Run scheduling loop until complete.
--> 187 while self._scheduling_loop_step(self._topology) and not self._shutdown:
188 pass
189 except Exception as e:
190 # Propagate it to the result iterator.
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:235, in StreamingExecutor._scheduling_loop_step(self, topology)
230 logger.get_logger().info(\"Scheduling loop step...\")
232 # Note: calling process_completed_tasks() is expensive since it incurs
233 # ray.wait() overhead, so make sure to allow multiple dispatch per call for
234 # greater parallelism.
--> 235 process_completed_tasks(topology)
237 # Dispatch as many operators as we can for completed tasks.
238 limits = self._get_or_refresh_resource_limits()
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor_state.py:333, in process_completed_tasks(topology)
326 ready, _ = ray.wait(
327 list(active_tasks.keys()),
328 num_returns=len(active_tasks),
329 fetch_local=False,
330 timeout=0.1,
331 )
332 for ref in ready:
--> 333 active_tasks[ref].on_waitable_ready()
335 # Pull any operator outputs into the streaming op state.
336 for op, op_state in topology.items():
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py:88, in DataOpTask.on_waitable_ready(self)
80 meta = ray.get(next(self._streaming_gen))
81 except StopIteration:
82 # The generator should always yield 2 values (block and metadata)
83 # each time. If we get a StopIteration here, it means an error
(...)
86 # TODO(hchen): Ray Core should have a better interface for
87 # detecting and obtaining the exception.
---> 88 ex = ray.get(block_ref)
89 self._task_done_callback()
90 raise ex
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/auto_init_hook.py:24, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
21 @wraps(fn)
22 def auto_init_wrapper(*args, **kwargs):
23 auto_init_ray()
---> 24 return fn(*args, **kwargs)
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook.<locals>.wrapper(*args, **kwargs)
101 if func.__name__ != \"init\" or is_client_mode_enabled_by_default:
102 return getattr(ray, func.__name__)(*args, **kwargs)
--> 103 return func(*args, **kwargs)
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/worker.py:2547, in get(object_refs, timeout)
2545 worker.core_worker.dump_object_store_memory_usage()
2546 if isinstance(value, RayTaskError):
-> 2547 raise value.as_instanceof_cause()
2548 else:
2549 raise value
RayTaskError(FileNotFoundError): ray::FlatMap(extract_sections)() (pid=153397, ip=192.168.1.82)
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 405, in _map_task
for b_out in map_transformer.apply_transform(iter(blocks), ctx):
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 345, in __call__
for data in iter:
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 171, in __call__
yield from self._row_fn(input, ctx)
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 245, in transform_fn
for out_row in fn(row):
File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 119, in fn
return op_fn(item, *fn_args, **fn_kwargs)
File \"/tmp/ray/session_2023-10-11_12-45-18_995895_152214/runtime_resources/working_dir_files/_ray_pkg_74b1a494592133c8/rag/data.py\", line 29, in extract_sections
with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
FileNotFoundError: [Errno 2] No such file or directory: 'docs.ray.io/en/master/tune.html'"
}
Download the dataset using this command on your local machine.
wget -e robots=off --recursive --no-clobber --page-requisites \
  --html-extension --convert-links --restrict-file-names=windows \
  --domains docs.ray.io --no-parent --accept=html \
  -P $EFS_DIR https://docs.ray.io/en/master/
Hi, I'm running into the exact same issue. When running the command
wget -e robots=off --recursive --no-clobber --page-requisites \
  --html-extension --convert-links --restrict-file-names=windows \
  --domains docs.ray.io --no-parent --accept=html \
  -P $EFS_DIR https://docs.ray.io/en/master/
I'm getting the same issue as ray-project/ray#26320, so I had to set $EFS_DIR to ../data instead of /mnt/shared_storage/ray-assistant-data, because of this issue: #100
Even with this workaround, I'm still getting errors when running the same line in the notebook:
sections_ds.count()
{
"name": "RayTaskError(UserCodeException)",
"message": "ray::FlatMap(extract_sections)() (pid=41516, ip=127.0.0.1)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/tmp/ray/session_2024-04-24_20-30-47_848902_41459/runtime_resources/working_dir_files/_ray_pkg_82dd1b31f4f4a613/rag/data.py\", line 26, in extract_sections
with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '../data/docs.ray.io/en/master/joblib.html'
The above exception was the direct cause of the following exception:
ray::FlatMap(extract_sections)() (pid=41516, ip=127.0.0.1)
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 419, in _map_task
for b_out in map_transformer.apply_transform(iter(blocks), ctx):
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 392, in __call__
for data in iter:
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 134, in _udf_timed_iter
output = next(input)
^^^^^^^^^^^
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 216, in __call__
yield from self._row_fn(input, ctx)
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 264, in transform_fn
for out_row in fn(row):
^^^^^^^
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 127, in fn
_handle_debugger_exception(e)
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 143, in _handle_debugger_exception
raise UserCodeException() from e
ray.exceptions.UserCodeException",
"stack": "---------------------------------------------------------------------------
RayTaskError(UserCodeException) Traceback (most recent call last)
Cell In[25], line 3
1 # Extract sections
2 sections_ds = ds.flat_map(extract_sections)
----> 3 sections_ds.count()
File ~/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/dataset.py:2488, in Dataset.count(self)
2482 return meta_count
2484 get_num_rows = cached_remote_fn(_get_num_rows)
2486 return sum(
2487 ray.get(
-> 2488 [get_num_rows.remote(block) for block in self.get_internal_block_refs()]
2489 )
2490 )
File ~/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/dataset.py:4631, in Dataset.get_internal_block_refs(self)
4612 @ConsumptionAPI(pattern=\"Time complexity:\")
4613 @DeveloperAPI
4614 def get_internal_block_refs(self) -> List[ObjectRef[Block]]:
4615 \"\"\"Get a list of references to the underlying blocks of this dataset.
4616
4617 This function can be used for zero-copy access to the data. It blocks
(...)
4629 A list of references to this dataset's blocks.
4630 \"\"\"
-> 4631 blocks = self._plan.execute().get_blocks()
4632 self._synchronize_progress_bar()
4633 return blocks
File ~/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/exceptions.py:84, in omit_traceback_stdout.<locals>.handle_trace(*args, **kwargs)
80 logger.exception(
81 \"Full stack trace:\", exc_info=True, extra={\"hide\": not log_to_stdout}
82 )
83 if is_user_code_exception:
---> 84 raise e.with_traceback(None)
85 else:
86 raise e.with_traceback(None) from SystemException()
RayTaskError(UserCodeException): ray::FlatMap(extract_sections)() (pid=41516, ip=127.0.0.1)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/tmp/ray/session_2024-04-24_20-30-47_848902_41459/runtime_resources/working_dir_files/_ray_pkg_82dd1b31f4f4a613/rag/data.py\", line 26, in extract_sections
with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '../data/docs.ray.io/en/master/joblib.html'
The above exception was the direct cause of the following exception:
ray::FlatMap(extract_sections)() (pid=41516, ip=127.0.0.1)
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 419, in _map_task
for b_out in map_transformer.apply_transform(iter(blocks), ctx):
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 392, in __call__
for data in iter:
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 134, in _udf_timed_iter
output = next(input)
^^^^^^^^^^^
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 216, in __call__
yield from self._row_fn(input, ctx)
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 264, in transform_fn
for out_row in fn(row):
^^^^^^^
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 127, in fn
_handle_debugger_exception(e)
File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 143, in _handle_debugger_exception
raise UserCodeException() from e
ray.exceptions.UserCodeException"
}