lhoestq HF staff committed on
Commit
184f807
·
1 Parent(s): e417e74

add python code

Browse files
Files changed (1) hide show
  1. app.py +54 -7
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import ast
2
  import glob
 
3
  from itertools import islice
4
  from functools import partial
 
5
  from typing import Optional, Type
6
 
7
  import gradio as gr
@@ -26,6 +28,7 @@ from datatrove.utils.typeshelper import Languages
26
 
27
  nltk.download('punkt_tab')
28
  DUMP_TO_PROCESS = "CC-MAIN-2023-50"
 
29
 
30
  make_gallery_image_buttons_js = """
31
  function load() {
@@ -281,9 +284,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
281
  gopher_filtering_quality_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=gopher_filtering_quality_checkbox, outputs=acc)
282
  gopher_filtering_quality_parameters_components = [language_dropdown2, min_doc_words_slider, max_doc_words_slider, min_avg_word_length_slider, max_avg_word_length_slider, max_symbol_word_ratio_slider, max_bullet_lines_ratio_slider, max_ellipsis_lines_ratio_slider, max_non_alpha_words_ratio_slider, min_stop_words_slider, stop_words_textbox]
283
 
284
- with gr.Row():
285
- view_pipeline_results_button = gr.Button("Run Pipeline & Stream Results", variant="primary", scale=4)
286
- stop_button = gr.Button("Stop")
287
 
288
  steps = [
289
  URLFilter,
@@ -340,7 +341,6 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
340
  pii_removal_checkbox
341
  ] + sum(steps_parameters_components, [])
342
 
343
- @view_pipeline_results_button.click(inputs=inputs, outputs=[output_tab, output_dataframe, excluded_tab] + list(excluded_dataframes.values()) + list(excluded_tabs.values()))
344
  def view_pipeline_results(*args):
345
  enable_steps, steps_parameters = args[:len(steps)], args[len(steps):]
346
  steps_parameters_iter = iter(steps_parameters)
@@ -358,6 +358,43 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
358
  }
359
  for step_parameters_components in steps_parameters_components
360
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
  class ExclusionWriter:
363
 
@@ -380,19 +417,28 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
380
  ]
381
  output_docs: list[Document] = []
382
  num_warc_samples = 0
 
383
 
384
  def increment_num_warc_samples(data, rank, world_size, num_warc_samples_per_doc=1):
385
  nonlocal num_warc_samples
386
  for x in data:
387
  num_warc_samples += num_warc_samples_per_doc
388
  yield x
 
 
 
 
 
 
 
389
 
390
  if steps_parameters[:2] == default_steps_parameters[:2] and all(enable_steps[:2]):
391
 
392
  pipeline_executor = LocalPipelineExecutor(
393
  pipeline=[
394
  JsonlReader(data_folder=f"output_text_extraction-full/base_processing/output/{DUMP_TO_PROCESS}", glob_pattern="*.jsonl.gz"),
395
- partial(increment_num_warc_samples, num_warc_samples_per_doc=2000 / 1687)
 
396
  ] + steps_to_run[2:] + [
397
  lambda data, rank, world_size: islice(data, 100),
398
  lambda data, rank, world_size: map(output_docs.append, data)
@@ -404,7 +450,8 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
404
  pipeline_executor = LocalPipelineExecutor(
405
  pipeline=[
406
  WarcReader(data_folder="data", glob_pattern="*.warc.gz"),
407
- increment_num_warc_samples
 
408
  ] + steps_to_run + [
409
  lambda data, rank, world_size: islice(data, 100),
410
  lambda data, rank, world_size: map(output_docs.append, data)
@@ -465,7 +512,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
465
  },
466
  }
467
 
468
- stop_button.click(cancels=[view_pipeline_results])
469
 
470
  if __name__ == "__main__":
471
  demo.launch()
 
1
  import ast
2
  import glob
3
+ import time
4
  from itertools import islice
5
  from functools import partial
6
+ from textwrap import dedent
7
  from typing import Optional, Type
8
 
9
  import gradio as gr
 
28
 
29
  nltk.download('punkt_tab')
30
  DUMP_TO_PROCESS = "CC-MAIN-2023-50"
31
+ TIMEOUT = 600
32
 
33
  make_gallery_image_buttons_js = """
34
  function load() {
 
284
  gopher_filtering_quality_checkbox.change(lambda visible: gr.Accordion(visible=visible), inputs=gopher_filtering_quality_checkbox, outputs=acc)
285
  gopher_filtering_quality_parameters_components = [language_dropdown2, min_doc_words_slider, max_doc_words_slider, min_avg_word_length_slider, max_avg_word_length_slider, max_symbol_word_ratio_slider, max_bullet_lines_ratio_slider, max_ellipsis_lines_ratio_slider, max_non_alpha_words_ratio_slider, min_stop_words_slider, stop_words_textbox]
286
 
287
+ view_pipeline_results_button = gr.Button("Run Pipeline & Stream Results", variant="primary", scale=4)
 
 
288
 
289
  steps = [
290
  URLFilter,
 
341
  pii_removal_checkbox
342
  ] + sum(steps_parameters_components, [])
343
 
 
344
  def view_pipeline_results(*args):
345
  enable_steps, steps_parameters = args[:len(steps)], args[len(steps):]
346
  steps_parameters_iter = iter(steps_parameters)
 
358
  }
359
  for step_parameters_components in steps_parameters_components
360
  ]
361
+ yield {
362
+ python_code_markdown: dedent(
363
+ """
364
+ ```python
365
+ from datatrove.executor.local import LocalPipelineExecutor
366
+ from datatrove.pipeline.extractors import Trafilatura
367
+ from datatrove.pipeline.filters import (
368
+ C4QualityFilter,
369
+ FineWebQualityFilter,
370
+ GopherQualityFilter,
371
+ GopherRepetitionFilter,
372
+ LanguageFilter,
373
+ URLFilter,
374
+ )
375
+ from datatrove.pipeline.formatters import PIIFormatter
376
+ from datatrove.pipeline.readers import WarcReader
377
+ """
378
+ ).strip() + (
379
+ "\n\n"
380
+ "pipeline_executor = LocalPipelineExecutor(\n"
381
+ " pipeline=[\n"
382
+ f' WarcReader("s3://commoncrawl/crawl-data/{DUMP_TO_PROCESS}/segments", glob_pattern="*/warc/*"),\n'
383
+ ) + ",\n".join([
384
+ " " + step.__name__ + "(" + ", ".join(arg + "=" + str(value) for arg, value in step_parameters.items() if value != default_step_parameters[arg] and arg != "exclusion_writer") + ")"
385
+ for step, step_parameters, default_step_parameters, enable_step in zip(steps, steps_parameters, default_steps_parameters, enable_steps)
386
+ if enable_step
387
+ ]) + (
388
+ "\n"
389
+ " ]\n"
390
+ ")"
391
+ ) + dedent(
392
+ """
393
+ pipeline_executor.run()
394
+ ```
395
+ """
396
+ )
397
+ }
398
 
399
  class ExclusionWriter:
400
 
 
417
  ]
418
  output_docs: list[Document] = []
419
  num_warc_samples = 0
420
+ timeout_time = time.time() + TIMEOUT
421
 
422
  def increment_num_warc_samples(data, rank, world_size, num_warc_samples_per_doc=1):
423
  nonlocal num_warc_samples
424
  for x in data:
425
  num_warc_samples += num_warc_samples_per_doc
426
  yield x
427
+
428
+ def check_timeout(data, rank, world_size):
429
+ for x in data:
430
+ if time.time() > timeout_time:
431
+ gr.Info("Pipeline timed out")
432
+ break
433
+ yield x
434
 
435
  if steps_parameters[:2] == default_steps_parameters[:2] and all(enable_steps[:2]):
436
 
437
  pipeline_executor = LocalPipelineExecutor(
438
  pipeline=[
439
  JsonlReader(data_folder=f"output_text_extraction-full/base_processing/output/{DUMP_TO_PROCESS}", glob_pattern="*.jsonl.gz"),
440
+ partial(increment_num_warc_samples, num_warc_samples_per_doc=2000 / 1687),
441
+ check_timeout
442
  ] + steps_to_run[2:] + [
443
  lambda data, rank, world_size: islice(data, 100),
444
  lambda data, rank, world_size: map(output_docs.append, data)
 
450
  pipeline_executor = LocalPipelineExecutor(
451
  pipeline=[
452
  WarcReader(data_folder="data", glob_pattern="*.warc.gz"),
453
+ increment_num_warc_samples,
454
+ check_timeout
455
  ] + steps_to_run + [
456
  lambda data, rank, world_size: islice(data, 100),
457
  lambda data, rank, world_size: map(output_docs.append, data)
 
512
  },
513
  }
514
 
515
+ view_pipeline_results_button.click(view_pipeline_results, inputs=inputs, outputs=[output_tab, output_dataframe, excluded_tab, python_code_markdown] + list(excluded_dataframes.values()) + list(excluded_tabs.values()))
516
 
517
  if __name__ == "__main__":
518
  demo.launch()