viboognesh-doaz committed on
Commit
fbd44bb
·
1 Parent(s): b075822

fixed a few errors

Browse files
Files changed (2) hide show
  1. awsfunctions.py +26 -11
  2. pdf_processing.py +1 -1
awsfunctions.py CHANGED
@@ -82,33 +82,48 @@ def download_files_from_s3(local_folder, file_path_list):
82
  def download_folder_from_s3(local_folder, aws_folder_prefix):
83
  s3 = boto3.client('s3')
84
  bucket_name = os.getenv("AWS_BUCKET_NAME")
85
- folder_prefix = aws_folder_prefix
86
 
 
 
 
87
  try:
 
 
 
88
  # List objects in the S3 bucket
89
  paginator = s3.get_paginator('list_objects_v2')
90
- page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix)
91
-
92
- # Download filtered files
93
  for page in page_iterator:
94
  for obj in page.get('Contents', []):
95
  key = obj['Key']
96
 
 
 
 
 
97
  # Construct local file path
98
- local_path = os.path.join(local_folder, key)
99
  os.makedirs(os.path.dirname(local_path), exist_ok=True)
 
100
  try:
101
  print(f"Downloading: {key} -> {local_path}")
102
  s3.download_file(bucket_name, key, local_path)
103
  print(f"Downloaded: {local_path}")
104
- except Exception as e:
105
- print(f"Error downloading {key}: {e}")
106
- raise e
 
 
 
 
 
 
 
107
 
108
- except NoCredentialsError:
109
- print("No AWS credentials found.")
110
  except Exception as e:
111
- print(f"An error occurred: {e}")
112
 
113
  def delete_s3_folder(folder_path):
114
  bucket_name = os.getenv("AWS_BUCKET_NAME")
 
82
  def download_folder_from_s3(local_folder, aws_folder_prefix):
83
  s3 = boto3.client('s3')
84
  bucket_name = os.getenv("AWS_BUCKET_NAME")
 
85
 
86
+ if not bucket_name:
87
+ raise ValueError("AWS_BUCKET_NAME environment variable is not set")
88
+
89
  try:
90
+ # Create the local folder if it doesn't exist
91
+ os.makedirs(local_folder, exist_ok=True)
92
+
93
  # List objects in the S3 bucket
94
  paginator = s3.get_paginator('list_objects_v2')
95
+ page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=aws_folder_prefix)
96
+
97
+ # Process objects
98
  for page in page_iterator:
99
  for obj in page.get('Contents', []):
100
  key = obj['Key']
101
 
102
+ # Determine if it's a file or directory
103
+ if obj['Size'] == 0:
104
+ continue
105
+
106
  # Construct local file path
107
+ local_path = os.path.join(local_folder, os.path.relpath(key, aws_folder_prefix))
108
  os.makedirs(os.path.dirname(local_path), exist_ok=True)
109
+
110
  try:
111
  print(f"Downloading: {key} -> {local_path}")
112
  s3.download_file(bucket_name, key, local_path)
113
  print(f"Downloaded: {local_path}")
114
+ except ClientError as e:
115
+ if e.response['Error']['Code'] == 'AccessDenied':
116
+ print(f"Permission denied when trying to download {key}: {e}")
117
+ elif e.response['Error']['Code'] == 'NoSuchKey':
118
+ print(f"The object {key} does not exist in the bucket.")
119
+ elif e.response['Error']['Code'] == "":
120
+ pass
121
+ else:
122
+ print(f"An error occurred while downloading {key}: {e}")
123
+ raise e
124
 
 
 
125
  except Exception as e:
126
+ print(f"An unexpected error occurred : {e}")
127
 
128
  def delete_s3_folder(folder_path):
129
  bucket_name = os.getenv("AWS_BUCKET_NAME")
pdf_processing.py CHANGED
@@ -113,7 +113,7 @@ def process_pdf(pdf_file):
113
  if check_file_exists_in_s3(os.path.join(aws_prefix_path, pdf_file.name)):
114
  temp_dir = tempfile.mkdtemp()
115
  download_folder_from_s3(local_folder=temp_dir, aws_folder_prefix=os.path.join(aws_prefix_path, "qdrant"))
116
- client = qdrant_client.QdrantClient(path=os.path.join(temp_dir))
117
  image_store = QdrantVectorStore(client = client , collection_name=f"image_collection")
118
  text_store = QdrantVectorStore(client = client , collection_name=f"text_collection")
119
  index = MultiModalVectorStoreIndex.from_vector_store(vector_store=text_store, image_store=image_store)
 
113
  if check_file_exists_in_s3(os.path.join(aws_prefix_path, pdf_file.name)):
114
  temp_dir = tempfile.mkdtemp()
115
  download_folder_from_s3(local_folder=temp_dir, aws_folder_prefix=os.path.join(aws_prefix_path, "qdrant"))
116
+ client = qdrant_client.QdrantClient(path=os.path.join(temp_dir, "qdrant"))
117
  image_store = QdrantVectorStore(client = client , collection_name=f"image_collection")
118
  text_store = QdrantVectorStore(client = client , collection_name=f"text_collection")
119
  index = MultiModalVectorStoreIndex.from_vector_store(vector_store=text_store, image_store=image_store)