# img2poem/app.py
import requests
import os
import streamlit as st
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
# The Hugging Face token is read from the environment (e.g. a Space secret named "token").
API_TOKEN = os.getenv("token")
headers = {"Authorization": f"Bearer {API_TOKEN}"}
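# Optional robustness sketch (our own helper, not called by the functions below):
# the hosted Inference API can return HTTP 503 while a model is still loading,
# so a small retry wrapper around requests.post makes the calls less flaky.
import time

def query_with_retry(url, retries=3, wait=10, **kwargs):
    """POST to an Inference API URL, retrying while the model warms up."""
    for _ in range(retries):
        response = requests.post(url, headers=headers, **kwargs)
        if response.status_code == 200:
            return response.json()
        time.sleep(wait)
    response.raise_for_status()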
def img2text(path):
    """Caption the image at `path` with the hosted BLIP image-captioning model."""
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"

    def query(filename):
        with open(filename, "rb") as f:
            data = f.read()
        response = requests.post(API_URL, headers=headers, data=data)
        return response.json()

    output = query(path)[0]["generated_text"]
    print(output)
    return output
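# Variant sketch (not used by main below): the captioning endpoint accepts raw
# bytes, so the Streamlit upload could be captioned directly without the temp
# file that img2text expects. The function name img_bytes2text is our own.
def img_bytes2text(image_bytes):
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
    response = requests.post(API_URL, headers=headers, data=image_bytes)
    return response.json()[0]["generated_text"]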
def generate_story(scene):
    """Turn the image caption into a poem via the hosted Llama-3 8B Instruct model."""
    template = f'''
You are a poet;
You can generate a poem from a simple narrative, understand the theme, and use proper rhyming words.
The poem should not be shorter than 16 lines and not be longer than 20 lines.
Scenario: {scene}
Write a poem based on the provided scenario.
'''
    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"

    def query(payload):
        response = requests.post(API_URL, headers=headers, json=payload)
        return response.json()

    story = query({
        "inputs": template,
    })
    # The API echoes the prompt in "generated_text"; drop the leading lines that
    # repeat the template and keep only the generated poem.
    lines = str(story[0]["generated_text"]).split("\n")
    story = "\n".join(lines[12:])
    print(story)
    return story
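# Payload sketch (illustrative only): the text-generation task also accepts a
# "parameters" object; if the deployed endpoint honours return_full_text=False,
# the API returns just the completion and the line-slicing above becomes
# unnecessary. Verify support before relying on it.
# example_payload = {
#     "inputs": "Write a poem about the sea.",
#     "parameters": {"max_new_tokens": 400, "return_full_text": False},
# }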
def gen_audio(message):
    """Synthesize the poem to audio.wav with Parler-TTS (CPU inference)."""
    device = "cpu"
    model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
    tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

    prompt = message
    description = (
        "A female speaker with a slightly low-pitched, quite expressive voice delivers "
        "her words at a normal pace in a poetic manner with proper pauses while speaking "
        "inside a confined space with very clear audio."
    )

    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    sf.write("audio.wav", audio_arr, model.config.sampling_rate)
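# Optional Streamlit sketch (not wired into gen_audio above): caching the TTS
# model with st.cache_resource avoids re-downloading and re-initialising the
# checkpoint on every rerun of the app. The helper name load_tts_model is our own.
@st.cache_resource
def load_tts_model(device="cpu"):
    model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
    tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")
    return model, tokenizer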
def main():
    st.set_page_config(page_title="img 2 poem", page_icon="🤖")
    st.header("Turn an image into a poem")

    uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg", "svg"])

    if uploaded_file is not None:
        print(uploaded_file)
        bytes_data = uploaded_file.getvalue()
        print(bytes_data)
        # Save the upload to disk so img2text can read it back as a file.
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)

        st.image(uploaded_file, caption="Uploaded Image")

        scenario = img2text(uploaded_file.name)
        story = generate_story(scenario)
        gen_audio(story)

        with st.expander("Scenario"):
            st.write(scenario)
        with st.expander("Poem"):
            st.write(story)

        st.audio("audio.wav")
if __name__ == "__main__":
    main()
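# To run locally: `streamlit run app.py` (with the "token" environment variable set).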