Spaces:

Slyfox12
/

img2poem

Sleeping

File size: 3,252 Bytes

import os
import tempfile

import requests
import soundfile as sf
import streamlit as st
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# The Hugging Face API token is read from the environment so it never has to
# be committed with the source. "token" is the variable name the earlier
# (commented-out) version of this code read; HF_TOKEN is accepted as a
# fallback.
API_TOKEN = os.getenv("token") or os.getenv("HF_TOKEN") or ""

# Shared by every Inference API request in this file. Left empty when no token
# is configured, so requests go out unauthenticated instead of carrying a
# malformed "Bearer " header.
headers = {"Authorization": f"Bearer {API_TOKEN}"} if API_TOKEN else {}

def img2text(path, timeout=120):
    """Return a caption for the image file at ``path``.

    The raw bytes of the file are POSTed to the Hugging Face Inference API
    endpoint for ``Salesforce/blip-image-captioning-large`` and the
    ``generated_text`` field of the first result is returned.

    Args:
        path: Filesystem path of the image to caption.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The caption text produced by the model.

    Raises:
        OSError: If the image file cannot be read.
        requests.RequestException: If the request fails or times out.
        RuntimeError: If the response is not JSON or does not have the
            expected ``[{"generated_text": ...}]`` shape (for example when
            the service answers with an error object instead).
    """
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"

    with open(path, "rb") as f:
        data = f.read()

    response = requests.post(API_URL, headers=headers, data=data, timeout=timeout)

    try:
        result = response.json()
    except ValueError as err:
        raise RuntimeError(
            f"Image captioning API returned a non-JSON response "
            f"(HTTP {response.status_code})"
        ) from err

    try:
        output = result[0]['generated_text']
    except (KeyError, IndexError, TypeError) as err:
        # Surface the service's own error message when there is one, rather
        # than failing with an opaque ``KeyError: 0``.
        detail = result.get("error", result) if isinstance(result, dict) else result
        raise RuntimeError(
            f"Image captioning failed (HTTP {response.status_code}): {detail}"
        ) from err

    print(output)
    return output



def generate_story(scene, timeout=120):
    """Generate a poem about ``scene`` with a hosted Llama 3 model.

    ``scene`` is embedded in a fixed poet prompt, which is sent to the Hugging
    Face Inference API endpoint for ``meta-llama/Meta-Llama-3-8B-Instruct``.
    The first 12 lines of the returned text are discarded and the remainder
    is returned.

    Args:
        scene: Short description of the scenario the poem should be about.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The remaining lines of the generated text, each terminated by a
        newline. Empty if the generated text has 12 lines or fewer.

    Raises:
        requests.RequestException: If the request fails or times out.
        RuntimeError: If the response is not JSON or does not have the
            expected ``[{"generated_text": ...}]`` shape (for example when
            the service answers with an error object instead).
    """
    template = f'''
                You are a poet;
                You can generate a poem from a simple narrative, understand the theme, and use proper rhyming words.
                The poem should not be shorter than 16 lines and not be longer than 20 lines.

                Scenario: {scene}

                Write a poem based on the provided scenario.
                '''

    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"

    # NOTE(review): the skip presumably removes the echoed prompt plus a short
    # preamble from the model. The count is hard-coded and assumes ``scene``
    # contains no newlines - confirm against real API output.
    skipped_lines = 12

    response = requests.post(
        API_URL, headers=headers, json={"inputs": template}, timeout=timeout
    )

    try:
        result = response.json()
    except ValueError as err:
        raise RuntimeError(
            f"Text generation API returned a non-JSON response "
            f"(HTTP {response.status_code})"
        ) from err

    try:
        generated = result[0]['generated_text']
    except (KeyError, IndexError, TypeError) as err:
        # Surface the service's own error message when there is one, rather
        # than failing with an opaque ``KeyError: 0``.
        detail = result.get("error", result) if isinstance(result, dict) else result
        raise RuntimeError(
            f"Poem generation failed (HTTP {response.status_code}): {detail}"
        ) from err

    lines = str(generated).split("\n")
    # Every kept line is re-terminated with "\n", so a non-empty result ends
    # with a trailing newline.
    story = "".join(line + "\n" for line in lines[skipped_lines:])

    print(story)
    return story

def gen_audio(message):
    """Synthesize ``message`` as speech and save it to ``audio.wav``.

    Loads the ``parler-tts/parler_tts_mini_v0.1`` model and tokenizer on the
    CPU, generates audio for ``message`` using a fixed voice description, and
    writes the result to ``audio.wav`` (relative to the current working
    directory) at the sampling rate given by the model config.

    Args:
        message: The text to be spoken.
    """
    checkpoint = "parler-tts/parler_tts_mini_v0.1"
    target = "cpu"

    tts = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint).to(target)
    tok = AutoTokenizer.from_pretrained(checkpoint)

    voice = "A female speaker with a slightly low-pitched, quite expressive voice delivers her words at a normal  pace in a poetic manner with proper pauses while speaking inside a confined space with very clear audio."

    def encode(text):
        # Token ids as a tensor, moved to the same device as the model.
        return tok(text, return_tensors="pt").input_ids.to(target)

    # The voice description goes in as ``input_ids``; the text to speak goes
    # in as ``prompt_input_ids``.
    waveform = tts.generate(
        input_ids=encode(voice),
        prompt_input_ids=encode(message),
    )
    samples = waveform.cpu().numpy().squeeze()
    sf.write("audio.wav", samples, tts.config.sampling_rate)


def main():
    """Render the Streamlit page: upload an image, show its caption, poem and audio.

    Once an image is uploaded it is displayed, captioned with ``img2text``,
    turned into a poem with ``generate_story`` and read aloud with
    ``gen_audio``. The caption and poem are shown in expanders and the
    generated ``audio.wav`` is offered for playback.
    """
    st.set_page_config(page_title="img 2 poem", page_icon="🤖")
    st.header("Turn image into poem")
    uploaded_file = st.file_uploader(
        "choose an image.....", type=["png", "jpg", "jpeg", "svg"]
    )

    if uploaded_file is None:
        return

    st.image(uploaded_file, caption="Uploaded Image")

    # img2text needs a path on disk. Use a temporary file rather than the
    # client-supplied filename, so an upload can neither overwrite files in
    # the working directory nor be left behind after the run.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(uploaded_file.getvalue())
        image_path = tmp.name

    try:
        scenario = img2text(image_path)
    finally:
        os.unlink(image_path)

    story = generate_story(scenario)
    gen_audio(story)

    with st.expander("Scenario"):
        st.write(scenario)
    with st.expander("Poem"):
        st.write(story)

    st.audio("audio.wav")

# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()