Kuberwastaken committed
Commit 6149106 · 1 Parent(s): e15502b

Better model

.gitignore ADDED
@@ -0,0 +1 @@
+ treat-env
model/__pycache__/analyzer.cpython-310.pyc ADDED
Binary file (8.83 kB).
 
model/analyzer.py CHANGED
@@ -1,188 +1,250 @@
- import gradio as gr
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
  from datetime import datetime
- import gc
- import json
- import os
- from huggingface_hub import login
-
- class ContentAnalyzer:
-     def __init__(self):
-         self.model_name = "meta-llama/Llama-3.2-1B"
-         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-         self.tokenizer = None
-         self.model = None
-         # Authenticate with Hugging Face
-         if "HF_TOKEN" in os.environ:
-             print("Authenticating with Hugging Face...")
-             login(token=os.environ["HF_TOKEN"])
-         else:
-             print("Warning: HF_TOKEN not found in environment variables")
-
-     def load_model(self):
-         try:
-             print("Loading tokenizer...")
-             self.tokenizer = AutoTokenizer.from_pretrained(
-                 self.model_name,
-                 use_fast=True,
-                 token=os.environ.get("HF_TOKEN")
-             )
-
-             print(f"Loading model on {self.device}...")
-             self.model = AutoModelForCausalLM.from_pretrained(
-                 self.model_name,
-                 torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
-                 low_cpu_mem_usage=True,
-                 device_map="auto",
-                 token=os.environ.get("HF_TOKEN")
-             )
-             return True
-         except Exception as e:
-             print(f"Model loading error: {str(e)}")
-             return False
-
-     def cleanup(self):
-         if self.device == "cuda":
-             torch.cuda.empty_cache()
-         gc.collect()
-
-     def analyze_chunk(self, chunk, category_info):
-         mapped_name = category_info["mapped_name"]
-         description = category_info["description"]
-
-         print(f"\nAnalyzing for {mapped_name}...")
-         prompt = f"""Check this text for any indication of {mapped_name} ({description}).
-         Be sensitive to subtle references or implications, make sure the text is not metaphorical.
-         Respond concisely with: YES, NO, or MAYBE.
-         Text: {chunk}
-         Answer:"""
-
-         try:
-             print(f"Sending prompt to model...")
-             inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-             inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-             with torch.no_grad():
-                 print("Generating response...")
-                 outputs = self.model.generate(
-                     **inputs,
-                     max_new_tokens=10,
-                     do_sample=True,
-                     temperature=0.5,
-                     top_p=0.9,
-                     pad_token_id=self.tokenizer.eos_token_id
-                 )
-
-             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()
-             first_word = response.split("\n")[-1].split()[0] if response else "NO"
-
-             print(f"Model response for {mapped_name}: {first_word}")
-             if first_word == "YES":
-                 print(f"Detected {mapped_name} in this chunk!")
-             elif first_word == "MAYBE":
-                 print(f"Possible {mapped_name} detected, marking for review.")
-             else:
-                 print(f"No {mapped_name} detected in this chunk.")
-
-             score = 1 if first_word == "YES" else 0.5 if first_word == "MAYBE" else 0
-             return score, first_word
-
-         except Exception as e:
-             print(f"Chunk analysis error: {str(e)}")
-             return 0, "NO"
-
-     def analyze_text(self, text):
-         print("\n=== Starting Analysis ===")
-         print(f"Time: {datetime.now()}")
-
-         if not self.load_model():
-             return {
-                 "detected_triggers": {"0": "Error"},
-                 "confidence": "Low - Model loading failed",
-                 "model": self.model_name,
-                 "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-             }
-
-         chunk_size = 256
-         overlap = 15
-         script_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
-
-         # Using the more detailed trigger categories
          trigger_categories = {
              "Violence": {
                  "mapped_name": "Violence",
-                 "description": "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. Includes direct physical confrontations, implied violence, or large-scale events like wars, riots, or violent protests."
              },
              "Death": {
                  "mapped_name": "Death References",
-                 "description": "Any mention, implication, or depiction of the loss of life, including direct deaths of characters, mentions of deceased individuals, or abstract references to mortality."
              },
-             "Substance_Use": {
                  "mapped_name": "Substance Use",
-                 "description": "Any explicit or implied reference to the consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances."
              },
              "Gore": {
                  "mapped_name": "Gore",
-                 "description": "Extremely detailed and graphic depictions of highly severe physical injuries, mutilation, or extreme bodily harm."
              },
-             "Sexual_Content": {
                  "mapped_name": "Sexual Content",
-                 "description": "Any depiction or mention of sexual activity, intimacy, or sexual behavior."
              },
-             "Self_Harm": {
                  "mapped_name": "Self-Harm",
-                 "description": "Any mention or depiction of behaviors where an individual intentionally causes harm to themselves."
              },
-             "Gun_Use": {
                  "mapped_name": "Gun Use",
-                 "description": "Any explicit or implied mention of firearms being handled, fired, or used in a threatening manner."
              },
-             "Animal_Cruelty": {
                  "mapped_name": "Animal Cruelty",
-                 "description": "Any act of harm, abuse, or neglect toward animals, whether intentional or accidental."
              },
-             "Mental_Health": {
                  "mapped_name": "Mental Health Issues",
-                 "description": "Any reference to mental health struggles, disorders, or psychological distress."
              }
          }

          identified_triggers = {}

          for chunk_idx, chunk in enumerate(script_chunks, 1):
              print(f"\n--- Processing Chunk {chunk_idx}/{len(script_chunks)} ---")
              for category, info in trigger_categories.items():
-                 score, response = self.analyze_chunk(chunk, info)
-                 if response == "YES":
-                     identified_triggers[category] = identified_triggers.get(category, 0) + 1
-                 elif response == "MAYBE":
-                     identified_triggers[category] = identified_triggers.get(category, 0) + 0.5
-
-         final_triggers = [category for category, count in identified_triggers.items() if count > 0.5]
-         self.cleanup()
-
-         print("\n=== Analysis Complete ===")
          if not final_triggers:
-             result = {
-                 "detected_triggers": {"0": "None"},
-                 "confidence": "High - No concerning content detected",
-                 "model": self.model_name,
-                 "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-             }
-         else:
-             triggers_dict = {str(i): trigger for i, trigger in enumerate(final_triggers)}
-             result = {
-                 "detected_triggers": triggers_dict,
-                 "confidence": "High - Content detected",
-                 "model": self.model_name,
-                 "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-             }
-
-         print("\nFinal Result:", result)
-         return result
-
- def analyze_content(text):
-     analyzer = ContentAnalyzer()
-     result = analyzer.analyze_text(text)
-     return json.dumps(result, indent=2)

+ import os
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
  from datetime import datetime
+ import gradio as gr
+
+ # Fetch the Hugging Face token from the environment variable (secrets)
+ hf_token = os.getenv("HF_TOKEN")
+
+ if not hf_token:
+     raise ValueError("HF_TOKEN environment variable is not set!")
+
+ def analyze_script(script):
+     # Start the script analysis
+     print("\n=== Starting Analysis ===")
+     print(f"Time: {datetime.now()}")  # Output the current timestamp
+     print("Loading model and tokenizer...")
+
+     try:
+         # Load the tokenizer and model, selecting the appropriate device (CPU or CUDA)
+         tokenizer = AutoTokenizer.from_pretrained(
+             "meta-llama/Llama-3.2-1B",
+             use_fast=True,
+             token=hf_token  # Authenticate with the token read from the environment above
+         )
+         device = "cuda" if torch.cuda.is_available() else "cpu"  # Use CUDA if available, else use CPU
+         print(f"Using device: {device}")
+
+         model = AutoModelForCausalLM.from_pretrained(
+             "meta-llama/Llama-3.2-1B",
+             token=hf_token,  # Pass the token to authenticate
+             torch_dtype=torch.float16 if device == "cuda" else torch.float32,  # 16-bit precision for CUDA, 32-bit for CPU
+             device_map="auto"  # Automatically map model to the available device
+         )
+         print("Model loaded successfully")
+     except Exception as e:
+         print(f"An error occurred: {e}")
+         raise  # The analysis cannot continue without the model, so propagate the error
+
+     # Define trigger categories with their descriptions
+     trigger_categories = {
          "Violence": {
              "mapped_name": "Violence",
+             "description": (
+                 "Any act involving physical force or aggression intended to cause harm, injury, or death to a person, animal, or object. "
+                 "Includes direct physical confrontations (e.g., fights, beatings, or assaults), implied violence (e.g., very graphic threats or descriptions of injuries), "
+                 "or large-scale events like wars, riots, or violent protests."
+             )
          },
          "Death": {
              "mapped_name": "Death References",
+             "description": (
+                 "Any mention, implication, or depiction of the loss of life, including direct deaths of characters, mentions of deceased individuals, "
+                 "or abstract references to mortality (e.g., 'facing the end' or 'gone forever'). This also covers depictions of funerals, mourning, "
+                 "or grieving, and any dialogue that centers on death; do not count metaphors that don't actually lead to death."
+             )
          },
+         "Substance Use": {
              "mapped_name": "Substance Use",
+             "description": (
+                 "Any explicit or implied reference to the consumption, misuse, or abuse of drugs, alcohol, or other intoxicating substances. "
+                 "Includes scenes of drinking, smoking, or drug use, whether recreational or addictive. May also cover references to withdrawal symptoms, "
+                 "rehabilitation, or substance-related paraphernalia (e.g., needles, bottles, pipes)."
+             )
          },
          "Gore": {
              "mapped_name": "Gore",
+             "description": (
+                 "Extremely detailed and graphic depictions of highly severe physical injuries, mutilation, or extreme bodily harm, often accompanied by descriptions of heavy blood, exposed organs, "
+                 "or dismemberment. This includes war scenes with severe casualties, horror scenarios involving grotesque creatures, or medical procedures depicted with excessive detail."
+             )
+         },
+         "Vomit": {
+             "mapped_name": "Vomit",
+             "description": (
+                 "Any reference to the act of vomiting, whether directly described, implied, or depicted in detail. This includes sounds or visual descriptions of the act, "
+                 "mentions of nausea leading to vomiting, or its aftermath (e.g., the presence of vomit, cleaning it up, or characters reacting to it)."
+             )
          },
+         "Sexual Content": {
              "mapped_name": "Sexual Content",
+             "description": (
+                 "Any depiction or mention of sexual activity, intimacy, or sexual behavior, ranging from implied scenes to explicit descriptions. "
+                 "This includes romantic encounters, physical descriptions of characters in a sexual context, sexual dialogue, or references to sexual themes (e.g., harassment, innuendos)."
+             )
+         },
+         "Sexual Abuse": {
+             "mapped_name": "Sexual Abuse",
+             "description": (
+                 "Any form of non-consensual sexual act, behavior, or interaction, involving coercion, manipulation, or physical force. "
+                 "This includes incidents of sexual assault, molestation, exploitation, harassment, and any acts where an individual is subjected to sexual acts against their will or without their consent. "
+                 "It also covers discussions or depictions of the aftermath of such abuse, such as trauma, emotional distress, legal proceedings, or therapy. "
+                 "References to inappropriate sexual advances, groping, or any other form of sexual misconduct are also included, as well as the psychological and emotional impact on survivors. "
+                 "Scenes where individuals are placed in sexually compromising situations, even if not directly acted upon, may also fall under this category."
+             )
          },
+         "Self-Harm": {
              "mapped_name": "Self-Harm",
+             "description": (
+                 "Any mention or depiction of behaviors where an individual intentionally causes harm to themselves. This includes cutting, burning, or other forms of physical injury, "
+                 "as well as suicidal ideation, suicide attempts, or discussions of self-destructive thoughts and actions. References to scars, bruises, or other lasting signs of self-harm are also included."
+             )
          },
+         "Gun Use": {
              "mapped_name": "Gun Use",
+             "description": (
+                 "Any explicit or implied mention of firearms being handled, fired, or used in a threatening manner. This includes scenes of gun violence, references to shootings, "
+                 "gun-related accidents, or the presence of firearms in a tense or dangerous context (e.g., holstered weapons during an argument)."
+             )
          },
+         "Animal Cruelty": {
              "mapped_name": "Animal Cruelty",
+             "description": (
+                 "Any act of harm, abuse, or neglect toward animals, whether intentional or accidental. This includes physical abuse (e.g., hitting, injuring, or killing animals), "
+                 "mental or emotional mistreatment (e.g., starvation, isolation), and scenes where animals are subjected to pain or suffering for human entertainment or experimentation."
+             )
          },
+         "Mental Health Issues": {
              "mapped_name": "Mental Health Issues",
+             "description": (
+                 "Any reference to mental health struggles, disorders, or psychological distress. This includes mentions of depression, anxiety, PTSD, bipolar disorder, schizophrenia, "
+                 "or other conditions. Scenes depicting therapy sessions, psychiatric treatment, or coping mechanisms (e.g., medication, journaling) are also included. May cover subtle hints "
+                 "like a character expressing feelings of worthlessness, hopelessness, or detachment from reality."
+             )
          }
      }

+     print("\nProcessing text...")  # Indicate that the text is being processed
+     chunk_size = 256  # Chunk size for text processing
+     overlap = 15  # Overlap between chunks for context preservation
+     script_chunks = [script[i:i + chunk_size] for i in range(0, len(script), chunk_size - overlap)]
+
      identified_triggers = {}

      for chunk_idx, chunk in enumerate(script_chunks, 1):
          print(f"\n--- Processing Chunk {chunk_idx}/{len(script_chunks)} ---")
          for category, info in trigger_categories.items():
+             mapped_name = info["mapped_name"]
+             description = info["description"]
+
+             print(f"\nAnalyzing for {mapped_name}...")
+             prompt = f"""
+             Check this text for any indication of {mapped_name} ({description}).
+             Be sensitive to subtle references or implications, make sure the text is not metaphorical.
+             Respond concisely with: YES, NO, or MAYBE.
+             Text: {chunk}
+             Answer:
+             """
+
+             print("Sending prompt to model...")  # Indicate that the prompt is being sent to the model
+             inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)  # Tokenize the prompt
+             inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the chosen device
+
+             with torch.no_grad():  # Disable gradient calculation for inference
+                 print("Generating response...")  # Indicate that the model is generating a response
+                 outputs = model.generate(
+                     **inputs,
+                     max_new_tokens=3,  # Limit response length
+                     do_sample=True,  # Enable sampling for more diverse output
+                     temperature=0.5,  # Control randomness of the output
+                     top_p=0.9,  # Use nucleus sampling
+                     pad_token_id=tokenizer.eos_token_id  # Pad token ID
+                 )
+
+             response_text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().upper()  # Decode and normalize the response
+             first_word = response_text.split("\n")[-1].split()[0] if response_text else "NO"  # First word of the model's answer
+             print(f"Model response for {mapped_name}: {first_word}")
+
+             # Update identified triggers based on the model response
+             if first_word == "YES":
+                 print(f"Detected {mapped_name} in this chunk!")  # Trigger detected
+                 identified_triggers[mapped_name] = identified_triggers.get(mapped_name, 0) + 1
+             elif first_word == "MAYBE":
+                 print(f"Possible {mapped_name} detected, marking for further review.")  # Possible trigger detected
+                 identified_triggers[mapped_name] = identified_triggers.get(mapped_name, 0) + 0.5
+             else:
+                 print(f"No {mapped_name} detected in this chunk.")  # No trigger detected
+
+     print("\n=== Analysis Complete ===")  # Indicate that analysis is complete
+     print("Final Results:")
+     final_triggers = []  # List of confirmed triggers
+
+     # Filter and output the final trigger results
+     for mapped_name, count in identified_triggers.items():
+         if count > 0.5:
+             final_triggers.append(mapped_name)
+             print(f"- {mapped_name}: found in {count} chunks")

      if not final_triggers:
+         final_triggers = ["None"]
+
+     return final_triggers
+
+ def analyze_content(script):
+     # Perform the analysis on the input script using the analyze_script function
+     triggers = analyze_script(script)
+
+     # Build the result based on the triggers found
+     if isinstance(triggers, list) and triggers != ["None"]:
+         result = {
+             "detected_triggers": triggers,
+             "confidence": "High - Content detected",
+             "model": "Llama-3.2-1B",
+             "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+         }
+     else:
+         result = {
+             "detected_triggers": ["None"],
+             "confidence": "High - No concerning content detected",
+             "model": "Llama-3.2-1B",
+             "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+         }
+
+     print("\nFinal Result Dictionary:", result)
+     return result
+
+ # Create and launch the Gradio interface
+ iface = gr.Interface(
+     fn=analyze_content,
+     inputs=gr.Textbox(lines=8, label="Input Text"),
+     outputs=gr.JSON(),
+     title="Content Analysis",
+     description="Analyze text content for sensitive topics"
+ )

+ if __name__ == "__main__":
+     iface.launch()
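
Note: a minimal sketch, not part of the commit, of how the chunking arithmetic and the scoring threshold above behave, assuming the commit's constants (chunk_size=256, overlap=15) and the YES=1 / MAYBE=0.5 scoring:

# Sketch: chunk stride and scoring cutoff, under the assumptions stated above.
text = "x" * 600
chunk_size, overlap = 256, 15
chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
# Stride is 241, so chunk starts are 0, 241, 482 and adjacent chunks share 15 characters.
print([len(c) for c in chunks])  # [256, 256, 118]

# A category is reported once its accumulated score exceeds 0.5:
# one YES (1.0) or at least two MAYBEs (0.5 + 0.5) cross the cutoff; a lone MAYBE does not.
responses = ["MAYBE", "MAYBE", "NO"]
score = sum(1 if r == "YES" else 0.5 if r == "MAYBE" else 0 for r in responses)
print(score > 0.5)  # True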
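
Note: a hedged usage sketch of the new entry point, calling analyze_content directly instead of going through the Gradio UI. The import path and sample text are assumptions (the file is assumed to live at model/analyzer.py and HF_TOKEN to be set); the output is model-dependent.

# Hypothetical usage, not part of the commit.
from model.analyzer import analyze_content

result = analyze_content("He raised the gun and fired twice before fleeing the scene.")
print(result["detected_triggers"])  # e.g. ["Gun Use", "Violence"] -- varies with the model's sampling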