oceansweep committed on
Commit
7a8281e
·
verified ·
1 Parent(s): fa80e45

Update App_Function_Libraries/Chunk_Lib.py

Browse files
Files changed (1) hide show
  1. App_Function_Libraries/Chunk_Lib.py +586 -586
App_Function_Libraries/Chunk_Lib.py CHANGED
@@ -1,587 +1,587 @@
1
- # Chunk_Lib.py
2
- #########################################
3
- # Chunking Library
4
- # This library is used to perform chunking of input files.
5
- # Currently, uses naive approaches. Nothing fancy.
6
- #
7
- ####
8
- # Import necessary libraries
9
- import logging
10
- import re
11
-
12
- from typing import List, Optional, Tuple, Dict, Any
13
-
14
- from openai import OpenAI
15
- from tqdm import tqdm
16
- #
17
- # Import 3rd party
18
- from transformers import GPT2Tokenizer
19
- import nltk
20
- from nltk.tokenize import sent_tokenize, word_tokenize
21
- from sklearn.feature_extraction.text import TfidfVectorizer
22
- from sklearn.metrics.pairwise import cosine_similarity
23
- #
24
- # Import Local
25
- from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
26
- from App_Function_Libraries.Utils import load_comprehensive_config
27
-
28
-
29
- #
30
- #######################################################################################################################
31
- # Function Definitions
32
- #
33
-
34
- # FIXME - Make sure it only downloads if it already exists, and does a check first.
35
- # Ensure NLTK data is downloaded
36
- def ntlk_prep():
37
- nltk.download('punkt')
38
-
39
- # Load GPT2 tokenizer
40
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
41
-
42
- # Load Config file for API keys
43
- config = load_comprehensive_config()
44
- openai_api_key = config.get('API', 'openai_api_key', fallback=None)
45
-
46
- def load_document(file_path):
47
- with open(file_path, 'r') as file:
48
- text = file.read()
49
- return re.sub('\\s+', ' ', text).strip()
50
-
51
-
52
- def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
53
- chunk_method = chunk_options.get('method', 'words')
54
- max_chunk_size = chunk_options.get('max_size', 300)
55
- overlap = chunk_options.get('overlap', 0)
56
- language = chunk_options.get('language', 'english')
57
- adaptive = chunk_options.get('adaptive', False)
58
- multi_level = chunk_options.get('multi_level', False)
59
-
60
- if adaptive:
61
- max_chunk_size = adaptive_chunk_size(text, max_chunk_size)
62
-
63
- if multi_level:
64
- chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
65
- else:
66
- if chunk_method == 'words':
67
- chunks = chunk_text_by_words(text, max_chunk_size, overlap)
68
- elif chunk_method == 'sentences':
69
- chunks = chunk_text_by_sentences(text, max_chunk_size, overlap, language)
70
- elif chunk_method == 'paragraphs':
71
- chunks = chunk_text_by_paragraphs(text, max_chunk_size, overlap)
72
- elif chunk_method == 'tokens':
73
- chunks = chunk_text_by_tokens(text, max_chunk_size, overlap)
74
- elif chunk_method == 'chapters':
75
- return chunk_ebook_by_chapters(text, chunk_options)
76
- else:
77
- # No chunking applied
78
- chunks = [text]
79
-
80
- return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text)} for chunk in chunks]
81
-
82
-
83
- def adaptive_chunk_size(text: str, base_size: int) -> int:
84
- # Simple adaptive logic: adjust chunk size based on text complexity
85
- avg_word_length = sum(len(word) for word in text.split()) / len(text.split())
86
- if avg_word_length > 6: # Arbitrary threshold for "complex" text
87
- return int(base_size * 0.8) # Reduce chunk size for complex text
88
- return base_size
89
-
90
-
91
- def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
92
- # First level: chunk by paragraphs
93
- paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)
94
-
95
- # Second level: chunk each paragraph further
96
- chunks = []
97
- for para in paragraphs:
98
- if method == 'words':
99
- chunks.extend(chunk_text_by_words(para, max_size, overlap))
100
- elif method == 'sentences':
101
- chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language))
102
- else:
103
- chunks.append(para)
104
-
105
- return chunks
106
-
107
-
108
- def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
109
- words = text.split()
110
- chunks = []
111
- for i in range(0, len(words), max_words - overlap):
112
- chunk = ' '.join(words[i:i + max_words])
113
- chunks.append(chunk)
114
- return post_process_chunks(chunks)
115
-
116
-
117
- def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[
118
- str]:
119
- nltk.download('punkt', quiet=True)
120
- sentences = nltk.sent_tokenize(text, language=language)
121
- chunks = []
122
- for i in range(0, len(sentences), max_sentences - overlap):
123
- chunk = ' '.join(sentences[i:i + max_sentences])
124
- chunks.append(chunk)
125
- return post_process_chunks(chunks)
126
-
127
-
128
- def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
129
- paragraphs = re.split(r'\n\s*\n', text)
130
- chunks = []
131
- for i in range(0, len(paragraphs), max_paragraphs - overlap):
132
- chunk = '\n\n'.join(paragraphs[i:i + max_paragraphs])
133
- chunks.append(chunk)
134
- return post_process_chunks(chunks)
135
-
136
-
137
- def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
138
- # This is a simplified token-based chunking. For more accurate tokenization,
139
- # consider using a proper tokenizer like GPT-2 TokenizerFast
140
- words = text.split()
141
- chunks = []
142
- current_chunk = []
143
- current_token_count = 0
144
-
145
- for word in words:
146
- word_token_count = len(word) // 4 + 1 # Rough estimate of token count
147
- if current_token_count + word_token_count > max_tokens and current_chunk:
148
- chunks.append(' '.join(current_chunk))
149
- current_chunk = current_chunk[-overlap:] if overlap > 0 else []
150
- current_token_count = sum(len(w) // 4 + 1 for w in current_chunk)
151
-
152
- current_chunk.append(word)
153
- current_token_count += word_token_count
154
-
155
- if current_chunk:
156
- chunks.append(' '.join(current_chunk))
157
-
158
- return post_process_chunks(chunks)
159
-
160
-
161
- def post_process_chunks(chunks: List[str]) -> List[str]:
162
- return [chunk.strip() for chunk in chunks if chunk.strip()]
163
-
164
-
165
- def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]:
166
- try:
167
- start_index = full_text.index(chunk)
168
- metadata = {
169
- 'start_index': start_index,
170
- 'end_index': start_index + len(chunk),
171
- 'word_count': len(chunk.split()),
172
- 'char_count': len(chunk),
173
- 'chunk_type': chunk_type
174
- }
175
- if chunk_type == "chapter":
176
- metadata['chapter_number'] = chapter_number
177
- metadata['chapter_pattern'] = chapter_pattern
178
- return metadata
179
- except ValueError as e:
180
- logging.error(f"Chunk not found in full_text: {chunk[:50]}... Full text length: {len(full_text)}")
181
- raise
182
-
183
-
184
- # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
185
- def chunk_text_hybrid(text, max_tokens=1000):
186
- sentences = nltk.tokenize.sent_tokenize(text)
187
- chunks = []
188
- current_chunk = []
189
- current_length = 0
190
-
191
- for sentence in sentences:
192
- tokens = tokenizer.encode(sentence)
193
- if current_length + len(tokens) <= max_tokens:
194
- current_chunk.append(sentence)
195
- current_length += len(tokens)
196
- else:
197
- chunks.append(' '.join(current_chunk))
198
- current_chunk = [sentence]
199
- current_length = len(tokens)
200
-
201
- if current_chunk:
202
- chunks.append(' '.join(current_chunk))
203
-
204
- return chunks
205
-
206
- # Thanks openai
207
- def chunk_on_delimiter(input_string: str,
208
- max_tokens: int,
209
- delimiter: str) -> List[str]:
210
- chunks = input_string.split(delimiter)
211
- combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
212
- chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
213
- if dropped_chunk_count > 0:
214
- print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
215
- combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
216
- return combined_chunks
217
-
218
- # ????FIXME
219
- def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None):
220
- summarized_chunks = []
221
- current_summary = ""
222
-
223
- logging.debug(f"recursive_summarize_chunks: Summarizing {len(chunks)} chunks recursively...")
224
- logging.debug(f"recursive_summarize_chunks: temperature is @ {temp}")
225
- for i, chunk in enumerate(chunks):
226
- if i == 0:
227
- current_summary = summarize_func(chunk, custom_prompt, temp, system_prompt)
228
- else:
229
- combined_text = current_summary + "\n\n" + chunk
230
- current_summary = summarize_func(combined_text, custom_prompt, temp, system_prompt)
231
-
232
- summarized_chunks.append(current_summary)
233
-
234
- return summarized_chunks
235
-
236
-
237
- # Sample text for testing
238
- sample_text = """
239
- Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
240
- concerned with the interactions between computers and human language, in particular how to program computers
241
- to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
242
- the contents of documents, including the contextual nuances of the language within them. The technology can then
243
- accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
244
-
245
- Challenges in natural language processing frequently involve speech recognition, natural language understanding,
246
- and natural language generation.
247
-
248
- Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
249
- "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
250
- """
251
-
252
- # Example usage of different chunking methods
253
- # print("Chunking by words:")
254
- # print(chunk_text_by_words(sample_text, max_words=50))
255
- #
256
- # print("\nChunking by sentences:")
257
- # print(chunk_text_by_sentences(sample_text, max_sentences=2))
258
- #
259
- # print("\nChunking by paragraphs:")
260
- # print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
261
- #
262
- # print("\nChunking by tokens:")
263
- # print(chunk_text_by_tokens(sample_text, max_tokens=50))
264
- #
265
- # print("\nHybrid chunking:")
266
- # print(chunk_text_hybrid(sample_text, max_tokens=50))
267
-
268
-
269
-
270
- #######################################################################################################################
271
- #
272
- # Experimental Semantic Chunking
273
- #
274
-
275
- # Chunk text into segments based on semantic similarity
276
def count_units(text, unit='tokens'):
    """Measure the size of ``text`` in the requested unit.

    Parameters:
        - text (str): The text to measure.
        - unit (str): 'words' (whitespace-separated pieces), 'tokens'
          (result of NLTK's ``word_tokenize``) or 'characters' (``len(text)``).

    Returns:
        - int: The number of units found in ``text``.

    Raises:
        - ValueError: If ``unit`` is not one of the three supported values.
    """
    if unit == 'words':
        return len(text.split())
    if unit == 'tokens':
        return len(word_tokenize(text))
    if unit == 'characters':
        return len(text)
    raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
285
-
286
-
287
def semantic_chunking(text, max_chunk_size=2000, unit='words'):
    """Split ``text`` into chunks of sentences, breaking on size and topic shifts.

    Sentences are accumulated into a chunk until either
      * adding the next sentence would push the chunk past ``max_chunk_size``, or
      * the TF-IDF cosine similarity between the current sentence and the next
        one is below 0.5 and the chunk is already at least half full.
    Every time a chunk is emitted, its last three sentences are carried over
    as the start of the next chunk (overlap).

    Parameters:
        - text (str): The text to chunk.
        - max_chunk_size (int): Target maximum chunk size, measured in ``unit``.
        - unit (str): Unit passed to ``count_units`` ('words', 'tokens' or 'characters').

    Returns:
        - List[str]: The chunks, each a space-joined run of sentences. An empty
          list is returned when no sentences are found.
    """
    nltk.download('punkt', quiet=True)
    sentences = sent_tokenize(text)
    if not sentences:
        # TfidfVectorizer cannot be fitted on an empty corpus; nothing to chunk.
        return []

    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)

    overlap_sentences = 3        # sentences carried over into the next chunk
    similarity_threshold = 0.5   # below this, adjacent sentences count as a topic shift

    chunks = []
    current_chunk = []
    current_size = 0

    def flush():
        # Emit the current chunk and seed the next one with the overlap sentences.
        nonlocal current_chunk, current_size
        chunks.append(' '.join(current_chunk))
        current_chunk = current_chunk[-overlap_sentences:]
        current_size = count_units(' '.join(current_chunk), unit)

    last_index = len(sentences) - 1
    for i, sentence in enumerate(sentences):
        sentence_size = count_units(sentence, unit)

        # Size-based break: close the chunk before it would overflow.
        if current_chunk and current_size + sentence_size > max_chunk_size:
            flush()

        current_chunk.append(sentence)
        current_size += sentence_size

        # Similarity-based break: only considered when a next sentence exists.
        if i < last_index:
            similarity = cosine_similarity(sentence_vectors[i], sentence_vectors[i + 1])[0][0]
            if similarity < similarity_threshold and current_size >= max_chunk_size // 2:
                flush()

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
322
-
323
-
324
def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
    """Read a UTF-8 text file and split its content with ``semantic_chunking``.

    Parameters:
        - file_path (str): Path of the file to read.
        - max_chunk_size (int): Maximum chunk size forwarded to ``semantic_chunking``.
        - overlap (int): Kept for backward compatibility only. ``semantic_chunking``
          has no overlap parameter (it always carries over a fixed number of
          sentences), so this value is not used.

    Returns:
        - The list of chunks, or None if reading or chunking failed (the error
          is logged).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Do NOT forward ``overlap`` positionally: the third parameter of
        # semantic_chunking is ``unit``, and passing a number there makes
        # count_units raise, which previously turned every call into ``None``.
        return semantic_chunking(content, max_chunk_size)
    except Exception as e:
        logging.error(f"Error chunking text file: {str(e)}")
        return None
334
- #######################################################################################################################
335
-
336
-
337
-
338
-
339
-
340
-
341
- #######################################################################################################################
342
- #
343
- # OpenAI Rolling Summarization
344
- #
345
-
346
# Module-level OpenAI client, built once at import time from the configured API key.
client = OpenAI(api_key=openai_api_key)


def get_chat_completion(messages, model='gpt-4-turbo'):
    """Send ``messages`` to the chat-completions endpoint and return the reply text.

    Parameters:
        - messages: The chat messages, passed through unchanged to
          ``client.chat.completions.create``.
        - model (str): Model name to request. Defaults to 'gpt-4-turbo'.

    Returns:
        - The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,  # fixed at 0 for every request
    )
    return response.choices[0].message.content
354
-
355
-
356
- # This function combines text chunks into larger blocks without exceeding a specified token count.
357
- # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
358
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """Greedily merge consecutive chunks into blocks of at most ``max_tokens`` tokens.

    Parameters:
        - chunks (List[str]): The pieces to combine, in order.
        - max_tokens (int): Token budget per combined block, measured with
          ``openai_tokenize``.
        - chunk_delimiter (str): String used to join pieces inside a block.
        - header (Optional[str]): If given, placed at the start of every block.
        - add_ellipsis_for_overflow (bool): If True, a piece that is too large on
          its own is replaced by "..." in the current block when that still fits.

    Returns:
        - Tuple of (combined blocks, per-block lists of the original chunk
          indices, number of oversized chunks that were replaced by "...").
    """
    dropped_chunk_count = 0
    output = []          # final combined blocks
    output_indices = []  # for each block, the indices of the chunks it contains
    candidate = [] if header is None else [header]  # block currently being built
    candidate_indices = []

    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]

        # FIXME MAKE NOT OPENAI SPECIFIC
        # A chunk that does not fit even on its own can never be placed: skip it.
        if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print("warning: chunk overflow")
            if (
                    add_ellipsis_for_overflow
                    # FIXME MAKE NOT OPENAI SPECIFIC
                    and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
                dropped_chunk_count += 1
            continue  # this case would break downstream assumptions

        # Estimate the token count with the current chunk added.
        # FIXME MAKE NOT OPENAI SPECIFIC
        extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))

        if extended_candidate_token_count > max_tokens:
            # Block is full: emit it and start a new one with this chunk.
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header
            candidate_indices = [chunk_i]
        else:
            # Still room: keep extending the current block.
            candidate.append(chunk)
            candidate_indices.append(chunk_i)

    # Emit the trailing block unless it is empty (or holds nothing but the header).
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)

    return output, output_indices, dropped_chunk_count
403
-
404
-
405
def rolling_summarize(text: str,
                      detail: float = 0,
                      model: str = 'gpt-4-turbo',
                      additional_instructions: Optional[str] = None,
                      minimum_chunk_size: Optional[int] = 500,
                      chunk_delimiter: str = ".",
                      summarize_recursively=False,
                      verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
        - text (str): The text to be summarized.
        - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the
          summary. 0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
        - model (str, optional): The chat model used for every summarization request. Defaults to 'gpt-4-turbo'.
        - additional_instructions (Optional[str], optional): Additional instructions appended to the system
          message for customizing summaries.
        - minimum_chunk_size (Optional[int], optional): The minimum size (in tokens) for text chunks.
          Defaults to 500.
        - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
        - summarize_recursively (bool, optional): If True, each request also contains the previous summary
          for context.
        - verbose (bool, optional): If True, prints detailed information about the chunking process.

    Returns:
        - str: The summaries of all chunks, joined by blank lines.

    The function first determines the number of chunks by interpolating between a minimum and a maximum chunk
    count based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
    `summarize_recursively` is True, each summary is based on the previous summary plus the new chunk.
    """

    # check detail is set correctly
    assert 0 <= detail <= 1, "detail must be between 0 and 1"

    # interpolate the number of chunks to get the specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    # Clamp to 1: if chunking yields no chunks at all the interpolation can
    # reach 0, which would otherwise divide by zero below.
    num_chunks = max(1, int(min_chunks + detail * (max_chunks - min_chunks)))

    # adjust chunk_size based on interpolated number of chunks
    # FIXME MAKE NOT OPENAI SPECIFIC
    document_length = len(openai_tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        # FIXME MAKE NOT OPENAI SPECIFIC
        print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")

    # set system message - FIXME
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Combine previous summary with current chunk for recursive summarization
            combined_text = accumulated_summaries[-1] + "\n\n" + chunk
            user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
        else:
            user_message_content = chunk

        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    final_summary = '\n\n'.join(accumulated_summaries)
    return final_summary
479
-
480
- #
481
- #
482
- #######################################################################################################################
483
- #
484
- # Ebook Chapter Chunking
485
-
486
-
487
def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split ``text`` into one chunk per chapter, using the first heading pattern that matches.

    Options read from ``chunk_options``:
        - 'overlap' (int, default 0): number of characters preceding a chapter
          start that are prepended to every chapter except the first.
        - 'custom_chapter_pattern' (str or None): regex tried before the
          built-in heading patterns.
    ('max_size' is not used by this function.)

    Returns:
        - List of ``{'text': ..., 'metadata': ...}`` dicts. If no pattern matches,
          a single "whole_document" chunk containing the full text is returned.
          Text located before the first detected heading is not part of any
          chunk.
    """
    overlap = chunk_options.get('overlap', 0)
    custom_pattern = chunk_options.get('custom_chapter_pattern', None)

    case_insensitive = re.MULTILINE | re.IGNORECASE
    # Chapter heading patterns to try, in order, each with its own flags.
    chapter_patterns = [
        (custom_pattern, case_insensitive),
        (r'^#{1,2}\s+', case_insensitive),      # Markdown style: '# ' or '## '
        (r'^Chapter\s+\d+', case_insensitive),  # 'Chapter ' followed by numbers
        (r'^\d+\.\s+', case_insensitive),       # Numbered chapters: '1. ', '2. ', etc.
        # All caps headings. Must stay case-sensitive (IGNORECASE would accept
        # any letters-only line) and must start with a letter so that runs of
        # blank lines are not mistaken for headings.
        (r'^[A-Z][A-Z \t]*\r?$', re.MULTILINE),
    ]

    chapter_positions = []
    used_pattern = None

    for pattern, flags in chapter_patterns:
        if pattern is None:
            continue
        chapter_regex = re.compile(pattern, flags)
        chapter_positions = [match.start() for match in chapter_regex.finditer(text)]
        if chapter_positions:
            used_pattern = pattern
            break

    # If no chapters found, return the entire content as one chunk
    if not chapter_positions:
        return [{'text': text, 'metadata': get_chunk_metadata(text, text, chunk_type="whole_document")}]

    # Split content into chapters
    chunks = []
    for i, start in enumerate(chapter_positions):
        end = chapter_positions[i + 1] if i + 1 < len(chapter_positions) else None

        # Apply overlap if specified: reach back into the previous chapter.
        if overlap > 0 and i > 0:
            start = max(0, start - overlap)

        chunks.append(text[start:end])

    # Post-process chunks (strip whitespace, drop empty ones)
    processed_chunks = post_process_chunks(chunks)

    # Add metadata to chunks; chapter numbers are 1-based positions in the result.
    return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text, chunk_type="chapter", chapter_number=i + 1,
                                                           chapter_pattern=used_pattern)}
            for i, chunk in enumerate(processed_chunks)]
538
-
539
-
540
- # # Example usage
541
- # if __name__ == "__main__":
542
- # sample_ebook_content = """
543
- # # Chapter 1: Introduction
544
- #
545
- # This is the introduction.
546
- #
547
- # ## Section 1.1
548
- #
549
- # Some content here.
550
- #
551
- # # Chapter 2: Main Content
552
- #
553
- # This is the main content.
554
- #
555
- # ## Section 2.1
556
- #
557
- # More content here.
558
- #
559
- # CHAPTER THREE
560
- #
561
- # This is the third chapter.
562
- #
563
- # 4. Fourth Chapter
564
- #
565
- # This is the fourth chapter.
566
- # """
567
- #
568
- # chunk_options = {
569
- # 'method': 'chapters',
570
- # 'max_size': 500,
571
- # 'overlap': 50,
572
- # 'custom_chapter_pattern': r'^CHAPTER\s+[A-Z]+' # Custom pattern for 'CHAPTER THREE' style
573
- # }
574
- #
575
- # chunked_chapters = improved_chunking_process(sample_ebook_content, chunk_options)
576
- #
577
- # for i, chunk in enumerate(chunked_chapters, 1):
578
- # print(f"Chunk {i}:")
579
- # print(chunk['text'])
580
- # print(f"Metadata: {chunk['metadata']}\n")
581
-
582
-
583
-
584
-
585
- #
586
- # End of Chunking Library
587
  #######################################################################################################################
 
1
+ # Chunk_Lib.py
2
+ #########################################
3
+ # Chunking Library
4
+ # This library is used to perform chunking of input files.
5
+ # Currently, uses naive approaches. Nothing fancy.
6
+ #
7
+ ####
8
+ # Import necessary libraries
9
+ import logging
10
+ import re
11
+
12
+ from typing import List, Optional, Tuple, Dict, Any
13
+
14
+ from openai import OpenAI
15
+ from tqdm import tqdm
16
+ #
17
+ # Import 3rd party
18
+ from transformers import GPT2Tokenizer
19
+ import nltk
20
+ from nltk.tokenize import sent_tokenize, word_tokenize
21
+ from sklearn.feature_extraction.text import TfidfVectorizer
22
+ from sklearn.metrics.pairwise import cosine_similarity
23
+ #
24
+ # Import Local
25
+ from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
26
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
27
+
28
+
29
+ #
30
+ #######################################################################################################################
31
+ # Function Definitions
32
+ #
33
+
34
# Ensure NLTK data is downloaded (the download only happens when the data is missing)
def ntlk_prep():
    """Make sure the NLTK 'punkt' tokenizer data is available.

    The resource is looked up locally first; ``nltk.download`` is only called
    when that lookup fails, so repeated calls do not trigger a download.
    """
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
38
+
39
# Load GPT2 tokenizer (runs once, when this module is imported)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load Config file for API keys.
# openai_api_key is None when the 'API' section has no 'openai_api_key' entry.
config = load_comprehensive_config()
openai_api_key = config.get('API', 'openai_api_key', fallback=None)
45
+
46
def load_document(file_path, encoding='utf-8'):
    """Read a text file and return its content with whitespace normalized.

    Parameters:
        - file_path: Path of the file to read.
        - encoding (str): Text encoding used to decode the file. Defaults to
          'utf-8' so the result does not depend on the platform's locale.

    Returns:
        - str: The file content with every run of whitespace (including
          newlines) collapsed to a single space and the ends stripped.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()
    return re.sub(r'\s+', ' ', text).strip()
50
+
51
+
52
def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Chunk ``text`` according to ``chunk_options`` and attach metadata to each chunk.

    Options read from ``chunk_options`` (with defaults):
        - 'method' ('words'): 'words', 'sentences', 'paragraphs', 'tokens' or
          'chapters'; any other value leaves the text as a single chunk.
        - 'max_size' (300): chunk size, interpreted by the selected chunker.
        - 'overlap' (0): overlap between consecutive chunks.
        - 'language' ('english'): language for sentence tokenization.
        - 'adaptive' (False): if True, 'max_size' is adjusted by ``adaptive_chunk_size``.
        - 'multi_level' (False): if True, ``multi_level_chunking`` is used
          instead of the single-level chunkers (this takes precedence over
          the 'chapters' method).

    Returns:
        - List of ``{'text': chunk, 'metadata': ...}`` dicts. For the 'chapters'
          method the result of ``chunk_ebook_by_chapters`` is returned directly.

    Raises:
        - ValueError: propagated from ``get_chunk_metadata`` when a chunk cannot
          be located verbatim in ``text``.
    """
    chunk_method = chunk_options.get('method', 'words')
    max_chunk_size = chunk_options.get('max_size', 300)
    overlap = chunk_options.get('overlap', 0)
    language = chunk_options.get('language', 'english')
    adaptive = chunk_options.get('adaptive', False)
    multi_level = chunk_options.get('multi_level', False)

    if adaptive:
        max_chunk_size = adaptive_chunk_size(text, max_chunk_size)

    if multi_level:
        chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
    elif chunk_method == 'words':
        chunks = chunk_text_by_words(text, max_chunk_size, overlap)
    elif chunk_method == 'sentences':
        chunks = chunk_text_by_sentences(text, max_chunk_size, overlap, language)
    elif chunk_method == 'paragraphs':
        chunks = chunk_text_by_paragraphs(text, max_chunk_size, overlap)
    elif chunk_method == 'tokens':
        chunks = chunk_text_by_tokens(text, max_chunk_size, overlap)
    elif chunk_method == 'chapters':
        # Chapter chunking builds its own metadata and reads the raw options.
        return chunk_ebook_by_chapters(text, chunk_options)
    else:
        # No chunking applied
        chunks = [text]

    return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text)} for chunk in chunks]
81
+
82
+
83
def adaptive_chunk_size(text: str, base_size: int) -> int:
    """Shrink the chunk size for text made of long words.

    Simple adaptive logic: if the average word length exceeds 6 characters the
    text is treated as "complex" and the size is reduced to 80% of ``base_size``.

    Parameters:
        - text (str): The text whose words are measured.
        - base_size (int): The chunk size to start from.

    Returns:
        - int: ``int(base_size * 0.8)`` for complex text, otherwise ``base_size``.
          Text without any words leaves ``base_size`` unchanged.
    """
    words = text.split()
    if not words:
        # Nothing to average over; avoid dividing by zero.
        return base_size

    avg_word_length = sum(len(word) for word in words) / len(words)
    if avg_word_length > 6:  # Arbitrary threshold for "complex" text
        return int(base_size * 0.8)  # Reduce chunk size for complex text
    return base_size
89
+
90
+
91
def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
    """Chunk ``text`` in two passes: by paragraphs first, then by ``method``.

    Parameters:
        - text (str): The text to chunk.
        - method (str): 'words' or 'sentences' for the second pass; any other
          value keeps the first-level paragraph groups unchanged.
        - max_size (int): Size limit for the second pass. The first pass groups
          up to ``max_size * 2`` paragraphs together.
        - overlap (int): Overlap passed to both passes.
        - language (str): Language used for sentence tokenization.

    Returns:
        - List[str]: The second-level chunks, in document order.
    """
    # First level: chunk by paragraphs
    paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)

    # Second level: chunk each paragraph group further
    chunks = []
    for para in paragraphs:
        if method == 'words':
            chunks.extend(chunk_text_by_words(para, max_size, overlap))
        elif method == 'sentences':
            chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language))
        else:
            chunks.append(para)

    return chunks
106
+
107
+
108
def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
    """Split ``text`` into chunks of at most ``max_words`` whitespace-separated words.

    Parameters:
        - text (str): The text to chunk.
        - max_words (int): Maximum number of words per chunk.
        - overlap (int): Number of words shared between consecutive chunks.

    Returns:
        - List[str]: The chunks, each a space-joined run of words.

    Raises:
        - ValueError: If ``overlap`` is not smaller than ``max_words``; the
          window could then never advance through the text.
    """
    step = max_words - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_words ({max_words})")

    words = text.split()
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), step)]
    return post_process_chunks(chunks)
115
+
116
+
117
def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0,
                            language: str = 'english') -> List[str]:
    """Split ``text`` into chunks of at most ``max_sentences`` sentences.

    Parameters:
        - text (str): The text to chunk.
        - max_sentences (int): Maximum number of sentences per chunk.
        - overlap (int): Number of sentences shared between consecutive chunks.
        - language (str): Language passed to NLTK's sentence tokenizer.

    Returns:
        - List[str]: The chunks, each a space-joined run of sentences.

    Raises:
        - ValueError: If ``overlap`` is not smaller than ``max_sentences``; the
          window could then never advance through the text.
    """
    step = max_sentences - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_sentences ({max_sentences})")

    nltk.download('punkt', quiet=True)
    sentences = nltk.sent_tokenize(text, language=language)
    chunks = [' '.join(sentences[i:i + max_sentences]) for i in range(0, len(sentences), step)]
    return post_process_chunks(chunks)
126
+
127
+
128
def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
    """Split ``text`` into chunks of at most ``max_paragraphs`` paragraphs.

    Paragraphs are separated by a blank line (a newline, optional whitespace,
    and another newline).

    Parameters:
        - text (str): The text to chunk.
        - max_paragraphs (int): Maximum number of paragraphs per chunk.
        - overlap (int): Number of paragraphs shared between consecutive chunks.

    Returns:
        - List[str]: The chunks; paragraphs inside a chunk are joined by a blank line.

    Raises:
        - ValueError: If ``overlap`` is not smaller than ``max_paragraphs``; the
          window could then never advance through the text.
    """
    step = max_paragraphs - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_paragraphs ({max_paragraphs})")

    paragraphs = re.split(r'\n\s*\n', text)
    chunks = ['\n\n'.join(paragraphs[i:i + max_paragraphs]) for i in range(0, len(paragraphs), step)]
    return post_process_chunks(chunks)
135
+
136
+
137
def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
    """Split ``text`` into chunks whose estimated token count stays within ``max_tokens``.

    This is a simplified token-based chunking: no real tokenizer is used, each
    word is estimated at ``len(word) // 4 + 1`` tokens. For more accurate
    tokenization, consider using a proper tokenizer like GPT-2 TokenizerFast.

    Parameters:
        - text (str): The text to chunk.
        - max_tokens (int): Estimated-token budget per chunk. A single word
          whose estimate exceeds the budget still forms its own chunk.
        - overlap (int): Number of trailing *words* (not tokens) of a finished
          chunk that are repeated at the start of the next one.

    Returns:
        - List[str]: The chunks, each a space-joined run of words.
    """
    def estimate_tokens(word: str) -> int:
        # Rough estimate of token count
        return len(word) // 4 + 1

    chunks = []
    current_chunk = []
    current_token_count = 0

    for word in text.split():
        word_token_count = estimate_tokens(word)
        if current_chunk and current_token_count + word_token_count > max_tokens:
            chunks.append(' '.join(current_chunk))
            # Seed the next chunk with the overlap words and re-count them.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else []
            current_token_count = sum(estimate_tokens(w) for w in current_chunk)

        current_chunk.append(word)
        current_token_count += word_token_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return post_process_chunks(chunks)
159
+
160
+
161
def post_process_chunks(chunks: List[str]) -> List[str]:
    """Strip surrounding whitespace from every chunk and drop the ones left empty.

    Parameters:
        - chunks (List[str]): Raw chunks.

    Returns:
        - List[str]: The stripped, non-empty chunks in their original order.
    """
    stripped_chunks = (chunk.strip() for chunk in chunks)  # strip each chunk once
    return [chunk for chunk in stripped_chunks if chunk]
163
+
164
+
165
def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]:
    """Describe where ``chunk`` sits in ``full_text`` and how large it is.

    Parameters:
        - chunk (str): The chunk text; it must occur verbatim in ``full_text``.
        - full_text (str): The text the chunk was taken from.
        - chunk_type (str): Label stored in the metadata. Defaults to "generic".
        - chapter_number (Optional[int]): Stored only when ``chunk_type`` is "chapter".
        - chapter_pattern (Optional[str]): Stored only when ``chunk_type`` is "chapter".

    Returns:
        - Dict with 'start_index', 'end_index', 'word_count', 'char_count' and
          'chunk_type', plus 'chapter_number' and 'chapter_pattern' for chapters.
          'start_index' refers to the first occurrence of ``chunk`` in ``full_text``.

    Raises:
        - ValueError: If ``chunk`` does not occur in ``full_text`` (the failure
          is logged before being re-raised).
    """
    try:
        start_index = full_text.index(chunk)
    except ValueError:
        logging.error("Chunk not found in full_text: %s... Full text length: %d", chunk[:50], len(full_text))
        raise

    metadata = {
        'start_index': start_index,
        'end_index': start_index + len(chunk),
        'word_count': len(chunk.split()),
        'char_count': len(chunk),
        'chunk_type': chunk_type
    }
    if chunk_type == "chapter":
        metadata['chapter_number'] = chapter_number
        metadata['chapter_pattern'] = chapter_pattern
    return metadata
182
+
183
+
184
+ # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
185
def chunk_text_hybrid(text, max_tokens=1000):
    """Group whole sentences into chunks of at most ``max_tokens`` GPT-2 tokens.

    Sentences are never split: a sentence that is longer than ``max_tokens``
    on its own still becomes a (too large) chunk.

    Parameters:
        - text (str): The text to chunk.
        - max_tokens (int): Token budget per chunk, counted with the
          module-level ``tokenizer``.

    Returns:
        - List[str]: The chunks, each a space-joined run of sentences.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        token_count = len(tokenizer.encode(sentence))
        if current_length + token_count <= max_tokens:
            current_chunk.append(sentence)
            current_length += token_count
            continue

        # Budget exceeded: close the running chunk. It is empty when the very
        # first sentence is already too long, in which case nothing is emitted
        # (previously an empty string chunk was produced here).
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        current_chunk = [sentence]
        current_length = token_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
205
+
206
+ # Thanks openai
207
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """Split ``input_string`` on ``delimiter`` and regroup the pieces into token-bounded chunks.

    The pieces are merged by ``combine_chunks_with_no_minimum`` (with ellipsis
    handling enabled for oversized pieces). A warning is printed when pieces
    were dropped.

    Parameters:
        - input_string (str): The text to split.
        - max_tokens (int): Token budget per combined chunk.
        - delimiter (str): String to split on; it is also used to re-join the
          pieces and is appended to the end of every returned chunk.

    Returns:
        - List[str]: The combined chunks, each ending with ``delimiter``.
    """
    pieces = input_string.split(delimiter)
    combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
        pieces, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
    if dropped_chunk_count > 0:
        print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
    return [f"{chunk}{delimiter}" for chunk in combined_chunks]
217
+
218
+ # ????FIXME
219
def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None):
    """Summarize ``chunks`` one after another, feeding each summary into the next step.

    The first chunk is summarized on its own. Every following chunk is
    appended (after a blank line) to the previous summary, and that combined
    text is summarized.

    Parameters:
        - chunks: Sequence of text chunks, in order.
        - summarize_func: Callable invoked as
          ``summarize_func(text, custom_prompt, temp, system_prompt)``; its
          return value is used as the summary.
        - custom_prompt: Forwarded unchanged to ``summarize_func``.
        - temp: Temperature forwarded unchanged to ``summarize_func``.
        - system_prompt: Forwarded unchanged to ``summarize_func``.

    Returns:
        - list: The running summary after each chunk; the last element is the
          summary covering all chunks. Empty when ``chunks`` is empty.
    """
    summarized_chunks = []
    current_summary = ""

    logging.debug("recursive_summarize_chunks: Summarizing %s chunks recursively...", len(chunks))
    logging.debug("recursive_summarize_chunks: temperature is @ %s", temp)
    for i, chunk in enumerate(chunks):
        # Only the first chunk is summarized without a preceding summary.
        text_to_summarize = chunk if i == 0 else current_summary + "\n\n" + chunk
        current_summary = summarize_func(text_to_summarize, custom_prompt, temp, system_prompt)
        summarized_chunks.append(current_summary)

    return summarized_chunks
235
+
236
+
237
+ # Sample text for testing
238
# Three short paragraphs about natural language processing. The commented-out
# example calls further down pass this string to the individual chunkers.
sample_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
concerned with the interactions between computers and human language, in particular how to program computers
to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
the contents of documents, including the contextual nuances of the language within them. The technology can then
accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.

Challenges in natural language processing frequently involve speech recognition, natural language understanding,
and natural language generation.

Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
"Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
"""
251
+
252
+ # Example usage of different chunking methods
253
+ # print("Chunking by words:")
254
+ # print(chunk_text_by_words(sample_text, max_words=50))
255
+ #
256
+ # print("\nChunking by sentences:")
257
+ # print(chunk_text_by_sentences(sample_text, max_sentences=2))
258
+ #
259
+ # print("\nChunking by paragraphs:")
260
+ # print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
261
+ #
262
+ # print("\nChunking by tokens:")
263
+ # print(chunk_text_by_tokens(sample_text, max_tokens=50))
264
+ #
265
+ # print("\nHybrid chunking:")
266
+ # print(chunk_text_hybrid(sample_text, max_tokens=50))
267
+
268
+
269
+
270
+ #######################################################################################################################
271
+ #
272
+ # Experimental Semantic Chunking
273
+ #
274
+
275
# Measure the size of a piece of text in the requested unit
def count_units(text, unit='tokens'):
    """Return the size of ``text`` measured in ``unit``.

    Args:
        text: The string to measure.
        unit: ``'words'`` (whitespace-separated pieces), ``'tokens'``
            (items returned by ``word_tokenize``) or ``'characters'``
            (``len(text)``).

    Returns:
        The count as an integer.

    Raises:
        ValueError: If ``unit`` is not one of the three supported names.
    """
    if unit == 'characters':
        return len(text)
    if unit == 'words':
        return len(text.split())
    if unit == 'tokens':
        return len(word_tokenize(text))
    raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
285
+
286
+
287
def semantic_chunking(text, max_chunk_size=2000, unit='words'):
    """Chunk text into segments based on size and sentence-to-sentence similarity.

    The text is split into sentences and each sentence is turned into a
    TF-IDF vector. Sentences are accumulated into a chunk, which is closed
    when either

    * adding the next sentence would push the chunk past ``max_chunk_size``, or
    * the cosine similarity between the current and the next sentence is
      below 0.5 and the chunk has reached at least half of ``max_chunk_size``.

    Whenever a chunk is closed, its last three sentences are carried over as
    the start of the next chunk. Because the carried-over sentences count
    towards the next chunk and the incoming sentence is always appended, a
    chunk can end up larger than ``max_chunk_size``.

    Args:
        text: The text to chunk.
        max_chunk_size: Target maximum chunk size, measured in ``unit``.
        unit: Unit understood by ``count_units`` ('words', 'tokens' or
            'characters').

    Returns:
        A list of chunk strings (sentences joined by one space). Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``unit`` is not accepted by ``count_units``.
    """
    # Nothing to chunk. Returning early also avoids fitting the vectorizer on
    # an empty corpus, which raises ValueError.
    if not text or not text.strip():
        return []

    nltk.download('punkt', quiet=True)
    sentences = sent_tokenize(text)
    if not sentences:
        return []

    try:
        sentence_vectors = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # The vectorizer found no usable terms (empty vocabulary), so no
        # similarity signal is available; fall back to size-based splitting.
        sentence_vectors = None

    overlap_sentences = 3        # sentences carried into the next chunk
    similarity_threshold = 0.5   # below this, adjacent sentences count as a topic shift

    chunks = []
    current_chunk = []
    current_size = 0

    def close_chunk(chunk_sentences):
        """Emit ``chunk_sentences`` and return the overlap that seeds the next chunk."""
        chunks.append(' '.join(chunk_sentences))
        carried = chunk_sentences[-overlap_sentences:]
        return carried, count_units(' '.join(carried), unit)

    last_index = len(sentences) - 1
    for i, sentence in enumerate(sentences):
        sentence_size = count_units(sentence, unit)
        if current_chunk and current_size + sentence_size > max_chunk_size:
            current_chunk, current_size = close_chunk(current_chunk)

        current_chunk.append(sentence)
        current_size += sentence_size

        if sentence_vectors is not None and i < last_index:
            similarity = cosine_similarity(sentence_vectors[i], sentence_vectors[i + 1])[0][0]
            if similarity < similarity_threshold and current_size >= max_chunk_size // 2:
                current_chunk, current_size = close_chunk(current_chunk)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
322
+
323
+
324
def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
    """Read a UTF-8 text file and split its content with ``semantic_chunking``.

    Args:
        file_path: Path of the text file to read.
        max_chunk_size: Forwarded to ``semantic_chunking`` as its
            ``max_chunk_size``.
        overlap: Kept for backward compatibility but not used:
            ``semantic_chunking`` has no overlap parameter and always carries
            over its own fixed number of sentences.

    Returns:
        The list of chunks, or ``None`` if reading or chunking failed (the
        error is logged).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Do not forward `overlap` positionally: the third parameter of
        # semantic_chunking is `unit`, so an integer there is rejected by
        # count_units as an invalid unit and every call would fail.
        return semantic_chunking(content, max_chunk_size)
    except Exception as e:
        logging.error(f"Error chunking text file: {str(e)}")
        return None
334
+ #######################################################################################################################
335
+
336
+
337
+
338
+
339
+
340
+
341
+ #######################################################################################################################
342
+ #
343
+ # OpenAI Rolling Summarization
344
+ #
345
+
346
client = OpenAI(api_key=openai_api_key)


def get_chat_completion(messages, model='gpt-4-turbo'):
    """Request a chat completion for ``messages`` and return the reply text.

    Args:
        messages: The message list passed as ``messages`` to
            ``client.chat.completions.create``.
        model: Model name passed to the same call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    request_kwargs = {
        "model": model,
        "messages": messages,
        "temperature": 0,
    }
    completion = client.chat.completions.create(**request_kwargs)
    first_choice = completion.choices[0]
    return first_choice.message.content
354
+
355
+
356
+ # This function combines text chunks into larger blocks without exceeding a specified token count.
357
+ # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
358
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """Greedily merge consecutive chunks into blocks of at most ``max_tokens`` tokens.

    Token counts are taken as ``len(openai_tokenize(...))`` of the joined text.

    Args:
        chunks: Text pieces, in order.
        max_tokens: Token limit for every combined block.
        chunk_delimiter: String used to join pieces inside a block.
        header: Optional text placed at the start of every block.
        add_ellipsis_for_overflow: If True, a dropped chunk is replaced by
            ``"..."`` in the current block, provided the block still fits.

    Returns:
        A 3-tuple ``(output, output_indices, dropped_chunk_count)``:
        the combined block strings, for each block the list of indices of the
        input chunks it contains, and the number of chunks that were dropped
        because they exceed ``max_tokens`` on their own.
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined chunks
    output_indices = []  # list to hold the indices of the final combined chunks
    candidate = (
        [] if header is None else [header]
    )  # list to hold the current combined chunk candidate
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # FIXME MAKE NOT OPENAI SPECIFIC
        if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            # The chunk does not fit even on its own, so it is dropped.
            logging.warning(f"Chunk {chunk_i} exceeds the {max_tokens}-token limit on its own and was dropped")
            if (
                    add_ellipsis_for_overflow
                    # FIXME MAKE NOT OPENAI SPECIFIC
                    and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
            dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # estimate token count with the current chunk added
        # FIXME MAKE NOT OPENAI SPECIFIC
        extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # otherwise keep extending the candidate
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # add the remaining candidate to output if it's not empty
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
403
+
404
+
405
def rolling_summarize(text: str,
                      detail: float = 0,
                      model: str = 'gpt-4-turbo',
                      additional_instructions: Optional[str] = None,
                      minimum_chunk_size: Optional[int] = 500,
                      chunk_delimiter: str = ".",
                      summarize_recursively=False,
                      verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
      0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The model name passed to `get_chat_completion`. Defaults to 'gpt-4-turbo'.
    - additional_instructions (Optional[str], optional): Additional instructions appended to the system message
      for customizing summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum size, in tokens, for text chunks. Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
    - summarize_recursively (bool, optional): If True, the most recent summary is prepended to each following
      chunk before it is summarized.
    - verbose (bool, optional): If True, prints detailed information about the chunking process.

    Returns:
    - str: The summaries of all chunks, joined by blank lines.

    The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
    based on the `detail` parameter. It then derives a chunk size from that count, splits the text into chunks and
    summarizes each chunk with one chat completion call.
    """

    # check detail is set correctly
    assert 0 <= detail <= 1

    # interpolate the number of chunks based to get specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    # Never go below min_chunks: if the splitter returned no chunks at all and
    # detail is 1, the interpolation yields 0 and the division below would fail.
    num_chunks = max(min_chunks, int(min_chunks + detail * (max_chunks - min_chunks)))

    # adjust chunk_size based on interpolated number of chunks
    # FIXME MAKE NOT OPENAI SPECIFIC
    document_length = len(openai_tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        # FIXME MAKE NOT OPENAI SPECIFIC
        print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")

    # set system message - FIXME
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Combine previous summary with current chunk for recursive summarization
            combined_text = accumulated_summaries[-1] + "\n\n" + chunk
            user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
        else:
            user_message_content = chunk

        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    final_summary = '\n\n'.join(accumulated_summaries)
    return final_summary
479
+
480
+ #
481
+ #
482
+ #######################################################################################################################
483
+ #
484
+ # Ebook Chapter Chunking
485
+
486
+
487
def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split ebook text into one chunk per chapter, based on heading patterns.

    A list of heading patterns is tried in order (the caller's custom pattern
    first, if any); the first pattern that matches anywhere in ``text``
    decides where chapters start. Each chapter runs from its heading to the
    next heading, or to the end of the text. Text before the first detected
    heading is not part of any chunk.

    Args:
        text: The full ebook text.
        chunk_options: Options dict. Keys read here:
            ``'overlap'`` (int, default 0) - number of characters preceding a
            chapter's heading that are prepended to that chapter (not applied
            to the first chapter);
            ``'custom_chapter_pattern'`` (str or None) - regular expression
            tried before the built-in patterns.
            ``'max_size'`` is currently not applied to chapter chunks.

    Returns:
        A list of ``{'text': ..., 'metadata': ...}`` dicts, with metadata
        produced by ``get_chunk_metadata``. If no pattern matches, a single
        ``"whole_document"`` chunk containing all of ``text`` is returned.
    """
    overlap = chunk_options.get('overlap', 0)
    custom_pattern = chunk_options.get('custom_chapter_pattern', None)

    case_insensitive = re.MULTILINE | re.IGNORECASE
    # (pattern, flags) pairs to try, in order
    chapter_patterns = [
        (custom_pattern, case_insensitive),
        (r'^#{1,2}\s+', case_insensitive),  # Markdown style: '# ' or '## '
        (r'^Chapter\s+\d+', case_insensitive),  # 'Chapter ' followed by numbers
        (r'^\d+\.\s+', case_insensitive),  # Numbered chapters: '1. ', '2. ', etc.
        # All caps headings. This one must stay case-sensitive: with
        # IGNORECASE, [A-Z] also matches lowercase letters and every
        # letters-only line would count as a heading. At least one capital
        # letter is required so that blank lines do not match either.
        (r'^[ \t]*[A-Z][A-Z \t]*\r?$', re.MULTILINE),
    ]

    chapter_positions = []
    used_pattern = None

    for pattern, flags in chapter_patterns:
        if pattern is None:
            continue
        chapter_regex = re.compile(pattern, flags)
        chapter_positions = [match.start() for match in chapter_regex.finditer(text)]
        if chapter_positions:
            used_pattern = pattern
            break

    # If no chapters found, return the entire content as one chunk
    if not chapter_positions:
        return [{'text': text, 'metadata': get_chunk_metadata(text, text, chunk_type="whole_document")}]

    # Split content into chapters: each chapter ends where the next one
    # starts; None makes the last slice run to the end of the text.
    chapter_ends = chapter_positions[1:] + [None]
    chunks = []
    for i, (start, end) in enumerate(zip(chapter_positions, chapter_ends)):
        # Apply overlap if specified
        if overlap > 0 and i > 0:
            start = max(0, start - overlap)
        chunks.append(text[start:end])

    # Post-process chunks
    processed_chunks = post_process_chunks(chunks)

    # Add metadata to chunks
    return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text, chunk_type="chapter", chapter_number=i + 1,
                                                           chapter_pattern=used_pattern)}
            for i, chunk in enumerate(processed_chunks)]
538
+
539
+
540
+ # # Example usage
541
+ # if __name__ == "__main__":
542
+ # sample_ebook_content = """
543
+ # # Chapter 1: Introduction
544
+ #
545
+ # This is the introduction.
546
+ #
547
+ # ## Section 1.1
548
+ #
549
+ # Some content here.
550
+ #
551
+ # # Chapter 2: Main Content
552
+ #
553
+ # This is the main content.
554
+ #
555
+ # ## Section 2.1
556
+ #
557
+ # More content here.
558
+ #
559
+ # CHAPTER THREE
560
+ #
561
+ # This is the third chapter.
562
+ #
563
+ # 4. Fourth Chapter
564
+ #
565
+ # This is the fourth chapter.
566
+ # """
567
+ #
568
+ # chunk_options = {
569
+ # 'method': 'chapters',
570
+ # 'max_size': 500,
571
+ # 'overlap': 50,
572
+ # 'custom_chapter_pattern': r'^CHAPTER\s+[A-Z]+' # Custom pattern for 'CHAPTER THREE' style
573
+ # }
574
+ #
575
+ # chunked_chapters = improved_chunking_process(sample_ebook_content, chunk_options)
576
+ #
577
+ # for i, chunk in enumerate(chunked_chapters, 1):
578
+ # print(f"Chunk {i}:")
579
+ # print(chunk['text'])
580
+ # print(f"Metadata: {chunk['metadata']}\n")
581
+
582
+
583
+
584
+
585
+ #
586
+ # End of Chunking Library
587
  #######################################################################################################################