Spaces:
Running
Running
Option for running websearch locally (#563)
Browse files* Fully local websearch
* better comments
* fix else if
.env
CHANGED
@@ -14,6 +14,7 @@ OPENAI_API_KEY=#your openai api key here
|
|
14 |
YDC_API_KEY=#your docs.you.com api key here
|
15 |
SERPER_API_KEY=#your serper.dev api key here
|
16 |
SERPAPI_KEY=#your serpapi key here
|
|
|
17 |
|
18 |
# Parameters to enable open id login
|
19 |
OPENID_CONFIG=`{
|
|
|
14 |
YDC_API_KEY=#your docs.you.com api key here
|
15 |
SERPER_API_KEY=#your serper.dev api key here
|
16 |
SERPAPI_KEY=#your serpapi key here
|
17 |
+
USE_LOCAL_WEBSEARCH=#set to true to parse Google results yourself (overrides the API keys above); leave empty to disable, since any non-empty value enables it
|
18 |
|
19 |
# Parameters to enable open id login
|
20 |
OPENID_CONFIG=`{
|
src/lib/server/websearch/runWebSearch.ts
CHANGED
@@ -15,6 +15,8 @@ import { getWebSearchProvider } from "./searchWeb";
|
|
15 |
const MAX_N_PAGES_SCRAPE = 10 as const;
|
16 |
const MAX_N_PAGES_EMBED = 5 as const;
|
17 |
|
|
|
|
|
18 |
export async function runWebSearch(
|
19 |
conv: Conversation,
|
20 |
prompt: string,
|
@@ -45,14 +47,14 @@ export async function runWebSearch(
|
|
45 |
const results = await searchWeb(webSearch.searchQuery);
|
46 |
webSearch.results =
|
47 |
(results.organic_results &&
|
48 |
-
results.organic_results.map((el: { title
|
49 |
const { title, link, text } = el;
|
50 |
const { hostname } = new URL(link);
|
51 |
return { title, link, hostname, text };
|
52 |
})) ??
|
53 |
[];
|
54 |
webSearch.results = webSearch.results
|
55 |
-
.filter(({ link }) => !link.includes(
|
56 |
.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
|
57 |
|
58 |
let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
|
|
|
15 |
const MAX_N_PAGES_SCRAPE = 10 as const;
|
16 |
const MAX_N_PAGES_EMBED = 5 as const;
|
17 |
|
18 |
+
const DOMAIN_BLOCKLIST = ["youtube.com", "twitter.com"];
|
19 |
+
|
20 |
export async function runWebSearch(
|
21 |
conv: Conversation,
|
22 |
prompt: string,
|
|
|
47 |
const results = await searchWeb(webSearch.searchQuery);
|
48 |
webSearch.results =
|
49 |
(results.organic_results &&
|
50 |
+
results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
|
51 |
const { title, link, text } = el;
|
52 |
const { hostname } = new URL(link);
|
53 |
return { title, link, hostname, text };
|
54 |
})) ??
|
55 |
[];
|
56 |
webSearch.results = webSearch.results
|
57 |
+
.filter(({ link }) => !DOMAIN_BLOCKLIST.some((el) => link.includes(el))) // filter out blocklist links
|
58 |
.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
|
59 |
|
60 |
let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
|
src/lib/server/websearch/searchWeb.ts
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
import type { YouWebSearch } from "../../types/WebSearch";
|
2 |
import { WebSearchProvider } from "../../types/WebSearch";
|
3 |
-
import { SERPAPI_KEY, SERPER_API_KEY, YDC_API_KEY } from "$env/static/private";
|
4 |
import { getJson } from "serpapi";
|
5 |
import type { GoogleParameters } from "serpapi";
|
|
|
6 |
|
7 |
// get which SERP api is providing web results
|
8 |
export function getWebSearchProvider() {
|
@@ -11,6 +12,9 @@ export function getWebSearchProvider() {
|
|
11 |
|
12 |
// Show result as JSON
|
13 |
export async function searchWeb(query: string) {
|
|
|
|
|
|
|
14 |
if (SERPER_API_KEY) {
|
15 |
return await searchWebSerper(query);
|
16 |
}
|
|
|
1 |
import type { YouWebSearch } from "../../types/WebSearch";
|
2 |
import { WebSearchProvider } from "../../types/WebSearch";
|
3 |
+
import { SERPAPI_KEY, SERPER_API_KEY, USE_LOCAL_WEBSEARCH, YDC_API_KEY } from "$env/static/private";
|
4 |
import { getJson } from "serpapi";
|
5 |
import type { GoogleParameters } from "serpapi";
|
6 |
+
import { searchWebLocal } from "./searchWebLocal";
|
7 |
|
8 |
// get which SERP api is providing web results
|
9 |
export function getWebSearchProvider() {
|
|
|
12 |
|
13 |
// Show result as JSON
|
14 |
export async function searchWeb(query: string) {
|
15 |
+
if (USE_LOCAL_WEBSEARCH) {
|
16 |
+
return await searchWebLocal(query);
|
17 |
+
}
|
18 |
if (SERPER_API_KEY) {
|
19 |
return await searchWebSerper(query);
|
20 |
}
|
src/lib/server/websearch/searchWebLocal.ts
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { JSDOM, VirtualConsole } from "jsdom";
|
2 |
+
|
3 |
+
/**
 * Searches Google directly (without a SERP API) and scrapes the result links
 * out of the returned HTML page.
 *
 * @param query - Raw search query; it is URL-encoded before being sent.
 * @returns An object whose `organic_results` array holds the unique result
 *   links, each as `{ link }`, in the order they appear on the page.
 * @throws If the request fails or is aborted by the 10 second timeout, or if
 *   the returned page contains no `<a>` element.
 */
export async function searchWebLocal(query: string) {
	const abortController = new AbortController();
	// abort the request if no answer arrived within 10 seconds
	const timeout = setTimeout(() => abortController.abort(), 10000);

	let htmlString: string;
	try {
		// encode the query so characters such as "&", "#" or "+" cannot alter the URL
		const response = await fetch(
			"https://www.google.com/search?hl=en&q=" + encodeURIComponent(query),
			{ signal: abortController.signal }
		);
		htmlString = await response.text();
	} finally {
		// do not leave the timer pending once the request has settled
		clearTimeout(timeout);
	}

	const virtualConsole = new VirtualConsole();

	virtualConsole.on("error", () => {
		// No-op to skip console errors.
	});

	// put the html string into a DOM
	const dom = new JSDOM(htmlString, {
		virtualConsole,
	});

	const { document } = dom.window;

	// get all <a> elements of the page
	const links = document.querySelectorAll("a");

	if (!links.length) {
		throw new Error(`webpage doesn't have any "a" element`);
	}

	const prefix = "/url?q=";

	// take urls that start with /url?q=
	// and do not contain google.com links
	// and strip them up to '&sa='
	const linksHref = Array.from(links)
		.filter((el) => el.href?.startsWith(prefix) && !el.href.includes("google.com/"))
		.map((el) => {
			const link = el.href;
			const end = link.indexOf("&sa=");
			// when "&sa=" is absent keep the rest of the string instead of
			// letting slice(…, -1) silently drop the last character
			// NOTE(review): the value is kept exactly as it appears in the href
			// (not percent-decoded) — confirm whether callers expect a decoded URL.
			return link.slice(prefix.length, end === -1 ? undefined : end);
		});

	// remove duplicate links and map links to the correct object shape
	return { organic_results: [...new Set(linksHref)].map((link) => ({ link })) };
}
|