victormiller committed on
Commit
0bc171c
·
verified ·
1 Parent(s): beddb3a

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +65 -8
curated.py CHANGED
@@ -514,6 +514,63 @@ freelaw_examples = Div(
514
  ),
515
  )
516
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  filtering_process = Div(
518
  Section(
519
  H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
@@ -643,10 +700,10 @@ filtering_process = Div(
643
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
644
  ),
645
  table_div_phil,
646
- # Details(
647
- # Summary("Phil Papers Filtering Examples"),
648
- # phil_examples,
649
- # ),
650
  ),
651
  ),
652
  Section(
@@ -751,10 +808,10 @@ filtering_process = Div(
751
  Li("Minimum Word Count Filter: 10"),
752
  ),
753
  table_div_se,
754
- # Details(
755
- # Summary("StackExchange Filtering Examples"),
756
- # se_examples,
757
- # ),
758
  ),
759
  ),
760
  Section(
 
514
  ),
515
  )
516
 
517
def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
    """Build the raw-vs-extracted sample view for one StackExchange document.

    Args:
        data_source: Dataset name. Only ``"StackExchange"`` loads the sample
            files; any other value falls back to ten empty documents.
        doc_id: Index of the sample document. Coerced with ``int()`` and
            clamped to the range 0-9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected document pair.

    Note:
        The sample files are assumed to be JSON arrays with at least ten
        entries -- TODO confirm. Shorter arrays make the indexing below raise
        ``IndexError``.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "StackExchange":
        # Context managers close each file as soon as it has been parsed,
        # rather than leaving the handle open until garbage collection.
        with open(
            "data/curated_samples/stackexchange_raw.json", encoding="utf-8"
        ) as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open(
            "data/curated_samples/stackexchange_extract.json", encoding="utf-8"
        ) as extracted_file:
            extracted_sample_doc = json.load(extracted_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        # NOTE(review): these labels stay "StackExchange" regardless of the
        # ``data_source`` argument -- confirm this is intended.
        data_source="StackExchange",
        data_sources="StackExchange",
        target=target,
    )
538
+
539
# Bordered container holding the StackExchange sample view; a fresh random id
# is passed as the view's target.
se_examples = Div(
    Div(get_se_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
545
+
546
def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
    """Build the raw-vs-extracted sample view for one PhilPapers document.

    Args:
        data_source: Dataset name. Only ``"PhilPapers"`` loads the sample
            file; any other value falls back to ten empty documents.
        doc_id: Index of the sample document. Coerced with ``int()`` and
            clamped to the range 0-9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected document pair.

    Note:
        The sample file is assumed to be a JSON array with at least ten
        entries -- TODO confirm. A shorter array makes the indexing below
        raise ``IndexError``.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "PhilPapers":
        # The context manager closes the file as soon as it has been parsed,
        # rather than leaving the handle open until garbage collection.
        with open(
            "data/curated_samples/philpapers_raw.json", encoding="utf-8"
        ) as sample_file:
            # NOTE(review): raw and extracted views share the same file (and
            # the same list object) -- confirm no separate extract file is
            # expected here.
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        # NOTE(review): these labels stay "PhilPapers" regardless of the
        # ``data_source`` argument -- confirm this is intended.
        data_source="PhilPapers",
        data_sources="PhilPapers",
        target=target,
    )
566
+
567
# Bordered container holding the PhilPapers sample view; a fresh random id is
# passed as the view's target.
phil_examples = Div(
    Div(get_phil_data(target=gen_random_id()), style="border: 1px solid #ccc; padding: 20px;")
)
573
+
574
  filtering_process = Div(
575
  Section(
576
  H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
 
700
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
701
  ),
702
  table_div_phil,
703
+ Details(
704
+ Summary("Phil Papers Filtering Examples"),
705
+ phil_examples,
706
+ ),
707
  ),
708
  ),
709
  Section(
 
808
  Li("Minimum Word Count Filter: 10"),
809
  ),
810
  table_div_se,
811
+ Details(
812
+ Summary("StackExchange Filtering Examples"),
813
+ se_examples,
814
+ ),
815
  ),
816
  ),
817
  Section(