File size: 3,642 Bytes
3505899
 
 
 
d6579b5
3505899
 
 
 
 
 
d6579b5
 
 
 
 
3505899
 
f93d314
3505899
 
 
 
 
 
 
 
 
490445b
 
 
 
 
 
 
 
 
3505899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490445b
 
3505899
 
 
 
 
 
 
 
 
6678ed6
2c475a1
6678ed6
2c475a1
 
 
d770ec6
2c475a1
 
 
 
 
 
6678ed6
2c475a1
d770ec6
2c475a1
d770ec6
2c475a1
d770ec6
2c475a1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from AssistantService import GPTAssistant
from openai.error import AuthenticationError
import streamlit as st
import configparser
import os

# Load the optional local configuration. ConfigParser.read() ignores a
# missing file, in which case the lookup below yields the empty-string
# fallback and the UI asks the user for a key instead.
config = configparser.ConfigParser()
config.read('config.ini')
assistant_api_key = config.get('DEFAULT', 'API-KEY', fallback='')

# Export the LangChain tracing settings through the process environment.
# The fixed values are written first, then the two values that come from
# the Streamlit secrets store (a missing secret raises at this point).
os.environ.update(
    LANGCHAIN_TRACING_V2="true",
    LANGCHAIN_ENDPOINT="https://api.smith.langchain.com",
)
for _secret_name in ("LANGCHAIN_API_KEY", "LANGCHAIN_PROJECT"):
    os.environ[_secret_name] = st.secrets[_secret_name]

# Page header and short introduction.
st.title("Web Scraping Assistant")
st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you.")
st.write("Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)")

# When config.ini did not provide a key, ask the user for one. The input is
# masked (type="password") so the secret is not displayed in clear text.
if not assistant_api_key:
    assistant_api_key = st.text_input("Paste your API key here:", type="password")

# NOTE: `gpt_assistant` is deliberately left undefined while no key is
# available; the sections below rely on that to prompt for the key.
if assistant_api_key:
    gpt_assistant = GPTAssistant(assistant_api_key)

# Step 1: the user pastes a representative HTML fragment and asks the
# assistant to infer the data format it contains.
html_content = st.text_input("Paste your piece of HTML here:")

extract_button = st.button("Extract data format")
if html_content and extract_button:
    # The assistant only exists once an API key has been supplied. Check for
    # it explicitly instead of catching NameError, which would also hide any
    # genuine NameError raised inside the assistant behind a misleading
    # "Complete the API key field" message.
    if "gpt_assistant" not in globals():
        st.write("Complete the API key field")
    else:
        try:
            # Persist the result so it survives Streamlit's script reruns.
            st.session_state['output_format'] = gpt_assistant.chain_response_format(html_content)
        except AuthenticationError:
            st.write("Invalid API key")

# Step 2: once a data format has been stored, display it and offer to
# generate the extraction code for it.
if 'output_format' in st.session_state:
    st.code(st.session_state['output_format'], language="json")

    if st.button("Generate the code"):
        # Explicit existence check rather than `except NameError`, so a real
        # NameError raised inside the assistant is not misreported as a
        # missing API key.
        if "gpt_assistant" not in globals():
            st.write("Complete the API key field")
        else:
            try:
                python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
            except AuthenticationError:
                st.write("Invalid API key")
            else:
                st.session_state['code_generated'] = python_code
                # Runnable variant used by the "Test the code" step: the
                # appended line calls `extract_info` (presumably defined by
                # the generated code) on `html_data` and stores `result`.
                st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"


# Step 3: show the generated function and let the user try it on a full page.
if 'code_generated' in st.session_state:
    st.write("Here is your python function:")
    st.code(st.session_state['code_generated'], language="python")
    full_content = st.text_input("Paste your complete HTML here:")
    test_code = st.button("Test the code")
    if full_content and test_code:
        # SECURITY: this executes model-generated code with exec(), which is
        # inherently unsafe. It runs in its own namespace so it cannot
        # overwrite the app's module-level names, but this is NOT a sandbox:
        # the code still has full access to the Python process.
        exec_namespace = {"html_data": full_content}
        result = None
        try:
            exec(st.session_state['code_generated_exec'], exec_namespace)
            result = exec_namespace.get("result")
        except Exception as err:
            # Generated code may be syntactically invalid or fail at runtime;
            # show the error and fall through to the failure message below
            # instead of crashing the page.
            st.exception(err)
        if result:
            st.write("data extracted successfully")
            # show data in table
            st.table(result)
        else:
            st.write("error extracting data")

# Collapsible usage guide with an optional sample HTML snippet.
with st.expander(label="How to use this app"):

    st.write('1. Paste the html code of your target element in the first text box and press "Enter"')
    if st.button("Show example"):
        st.text_area("Example", value='<li><div class="product"> <h3 class="title">Product 1</h3> <p class="description">This is the description of the product 1</p> <span class="price">10.00</span> </div></li>')
        # Clicking this button reruns the script; on that rerun the
        # "Show example" button is no longer pressed, so this branch is
        # skipped and the example disappears. No extra handling is needed.
        st.button("Close example")

    st.write("2. Click on the button 'Extract data format'")

    st.write("3. Click on the button 'Generate the code'")

    st.write("4. Paste the complete html code in the last text box to test the auto generated code")

    st.write("5. Copy the code and include it in your own projects")