Spaces:
Running
add you.com integration (#514)
Browse files* add you.com integration
* [Feat] Add types (#1)
* feat: add types
* feat: specify search provider
* [Feat] add return values (#3)
* feat: add types
* feat: specify search provider
* add values for ui
* add idea to gitignore
* lint and downgrade
* Revert "lint and downgrade"
This reverts commit fbfe012cf4d6aaf5ea00508ca0f99a408ce152e6.
* Updates to you.com integration (#4)
* `npm run format`
* enable search when `YDC_API_KEY` is available
* typing nit
* enum WebSearchProvider
* eslint-disable-next-line no-shadow
* downgrade
* You.com already provides text snippets (#5)
* Order desc you.com results by text length
* You.com already provides texts from webpages
* add to env
* Update .env
Co-authored-by: Mishig <[email protected]>
* Update src/lib/server/websearch/searchWeb.ts
---------
Co-authored-by: David Li <[email protected]>
Co-authored-by: Mishig <[email protected]>
- .env +1 -0
- .gitignore +2 -1
- README.md +4 -4
- src/lib/server/websearch/runWebSearch.ts +14 -10
- src/lib/server/websearch/searchWeb.ts +40 -3
- src/lib/types/WebSearch.ts +19 -0
- src/routes/+layout.server.ts +7 -2
@@ -10,6 +10,7 @@ HF_ACCESS_TOKEN=#hf_<token> from https://huggingface.co/settings/token
|
|
10 |
HF_API_ROOT=https://api-inference.huggingface.co/models
|
11 |
|
12 |
# used to activate search with web functionality. disabled if none are defined. choose one of the following:
|
|
|
13 |
SERPER_API_KEY=#your serper.dev api key here
|
14 |
SERPAPI_KEY=#your serpapi key here
|
15 |
|
|
|
10 |
HF_API_ROOT=https://api-inference.huggingface.co/models
|
11 |
|
12 |
# used to activate search with web functionality. disabled if none are defined. choose one of the following:
|
13 |
+
YDC_API_KEY=#your docs.you.com api key here
|
14 |
SERPER_API_KEY=#your serper.dev api key here
|
15 |
SERPAPI_KEY=#your serpapi key here
|
16 |
|
@@ -9,4 +9,5 @@ node_modules
|
|
9 |
!.env.template
|
10 |
vite.config.js.timestamp-*
|
11 |
vite.config.ts.timestamp-*
|
12 |
-
SECRET_CONFIG
|
|
|
|
9 |
!.env.template
|
10 |
vite.config.js.timestamp-*
|
11 |
vite.config.ts.timestamp-*
|
12 |
+
SECRET_CONFIG
|
13 |
+
.idea
|
@@ -76,8 +76,8 @@ npm run dev
|
|
76 |
|
77 |
Chat UI features a powerful Web Search feature. It works by:
|
78 |
|
79 |
-
1. Generating an appropriate
|
80 |
-
2. Performing
|
81 |
3. Creating embeddings from texts using [transformers.js](https://huggingface.co/docs/transformers.js). Specifically, using [Xenova/gte-small](https://huggingface.co/Xenova/gte-small) model.
|
82 |
4. From these embeddings, find the ones that are closest to the user query using vector similarity search. Specifically, we use `inner product` distance.
|
83 |
5. Get the corresponding texts to those closest embeddings and perform [Retrieval-Augmented Generation](https://huggingface.co/papers/2005.11401) (i.e. expand user prompt by adding those texts so that a LLM can use this information).
|
@@ -122,7 +122,7 @@ PUBLIC_APP_DISCLAIMER=
|
|
122 |
|
123 |
### Web Search config
|
124 |
|
125 |
-
You can enable the web search by adding
|
126 |
|
127 |
### Custom models
|
128 |
|
@@ -209,7 +209,7 @@ The following is the default `webSearchQueryPromptTemplate`.
|
|
209 |
```prompt
|
210 |
{{userMessageToken}}
|
211 |
My question is: {{message.content}}.
|
212 |
-
Based on the conversation history (my previous questions are: {{previousMessages}}), give me an appropriate query to answer my question for
|
213 |
{{userMessageEndToken}}
|
214 |
{{assistantMessageToken}}
|
215 |
```
|
|
|
76 |
|
77 |
Chat UI features a powerful Web Search feature. It works by:
|
78 |
|
79 |
+
1. Generating an appropriate search query from the user prompt.
|
80 |
+
2. Performing web search and extracting content from webpages.
|
81 |
3. Creating embeddings from texts using [transformers.js](https://huggingface.co/docs/transformers.js). Specifically, using [Xenova/gte-small](https://huggingface.co/Xenova/gte-small) model.
|
82 |
4. From these embeddings, find the ones that are closest to the user query using vector similarity search. Specifically, we use `inner product` distance.
|
83 |
5. Get the corresponding texts to those closest embeddings and perform [Retrieval-Augmented Generation](https://huggingface.co/papers/2005.11401) (i.e. expand user prompt by adding those texts so that a LLM can use this information).
|
|
|
122 |
|
123 |
### Web Search config
|
124 |
|
125 |
+
You can enable the web search by adding any of `YDC_API_KEY` ([docs.you.com](https://docs.you.com)) or `SERPER_API_KEY` ([serper.dev](https://serper.dev/)) or `SERPAPI_KEY` ([serpapi.com](https://serpapi.com/)) to your `.env.local`.
|
126 |
|
127 |
### Custom models
|
128 |
|
|
|
209 |
```prompt
|
210 |
{{userMessageToken}}
|
211 |
My question is: {{message.content}}.
|
212 |
+
Based on the conversation history (my previous questions are: {{previousMessages}}), give me an appropriate query to answer my question for web search. You should not say more than query. You should not say any words except the query. For the context, today is {{currentDate}}
|
213 |
{{userMessageEndToken}}
|
214 |
{{assistantMessageToken}}
|
215 |
```
|
@@ -10,6 +10,7 @@ import {
|
|
10 |
} from "$lib/server/websearch/sentenceSimilarity";
|
11 |
import type { Conversation } from "$lib/types/Conversation";
|
12 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
|
|
13 |
|
14 |
const MAX_N_PAGES_SCRAPE = 10 as const;
|
15 |
const MAX_N_PAGES_EMBED = 5 as const;
|
@@ -39,14 +40,15 @@ export async function runWebSearch(
|
|
39 |
|
40 |
try {
|
41 |
webSearch.searchQuery = await generateQuery(messages);
|
42 |
-
|
|
|
43 |
const results = await searchWeb(webSearch.searchQuery);
|
44 |
webSearch.results =
|
45 |
(results.organic_results &&
|
46 |
-
results.organic_results.map((el: { title: string; link: string }) => {
|
47 |
-
const { title, link } = el;
|
48 |
const { hostname } = new URL(link);
|
49 |
-
return { title, link, hostname };
|
50 |
})) ??
|
51 |
[];
|
52 |
webSearch.results = webSearch.results
|
@@ -58,12 +60,14 @@ export async function runWebSearch(
|
|
58 |
appendUpdate("Browsing results");
|
59 |
const promises = webSearch.results.map(async (result) => {
|
60 |
const { link } = result;
|
61 |
-
let text = "";
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
|
|
67 |
}
|
68 |
const MAX_N_CHUNKS = 100;
|
69 |
const texts = chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS);
|
|
|
10 |
} from "$lib/server/websearch/sentenceSimilarity";
|
11 |
import type { Conversation } from "$lib/types/Conversation";
|
12 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
13 |
+
import { getWebSearchProvider } from "./searchWeb";
|
14 |
|
15 |
const MAX_N_PAGES_SCRAPE = 10 as const;
|
16 |
const MAX_N_PAGES_EMBED = 5 as const;
|
|
|
40 |
|
41 |
try {
|
42 |
webSearch.searchQuery = await generateQuery(messages);
|
43 |
+
const searchProvider = getWebSearchProvider();
|
44 |
+
appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);
|
45 |
const results = await searchWeb(webSearch.searchQuery);
|
46 |
webSearch.results =
|
47 |
(results.organic_results &&
|
48 |
+
results.organic_results.map((el: { title: string; link: string; text?: string }) => {
|
49 |
+
const { title, link, text } = el;
|
50 |
const { hostname } = new URL(link);
|
51 |
+
return { title, link, hostname, text };
|
52 |
})) ??
|
53 |
[];
|
54 |
webSearch.results = webSearch.results
|
|
|
60 |
appendUpdate("Browsing results");
|
61 |
const promises = webSearch.results.map(async (result) => {
|
62 |
const { link } = result;
|
63 |
+
let text = result.text ?? "";
|
64 |
+
if (!text) {
|
65 |
+
try {
|
66 |
+
text = await parseWeb(link);
|
67 |
+
appendUpdate("Browsing webpage", [link]);
|
68 |
+
} catch (e) {
|
69 |
+
// ignore errors
|
70 |
+
}
|
71 |
}
|
72 |
const MAX_N_CHUNKS = 100;
|
73 |
const texts = chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS);
|
@@ -1,17 +1,26 @@
|
|
1 |
-
import {
|
2 |
-
|
|
|
3 |
import { getJson } from "serpapi";
|
4 |
import type { GoogleParameters } from "serpapi";
|
5 |
|
|
|
|
|
|
|
|
|
|
|
6 |
// Show result as JSON
|
7 |
export async function searchWeb(query: string) {
|
8 |
if (SERPER_API_KEY) {
|
9 |
return await searchWebSerper(query);
|
10 |
}
|
|
|
|
|
|
|
11 |
if (SERPAPI_KEY) {
|
12 |
return await searchWebSerpApi(query);
|
13 |
}
|
14 |
-
throw new Error("No Serper.dev or SerpAPI key found");
|
15 |
}
|
16 |
|
17 |
export async function searchWebSerper(query: string) {
|
@@ -59,3 +68,31 @@ export async function searchWebSerpApi(query: string) {
|
|
59 |
|
60 |
return response;
|
61 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { YouWebSearch } from "../../types/WebSearch";
|
2 |
+
import { WebSearchProvider } from "../../types/WebSearch";
|
3 |
+
import { SERPAPI_KEY, SERPER_API_KEY, YDC_API_KEY } from "$env/static/private";
|
4 |
import { getJson } from "serpapi";
|
5 |
import type { GoogleParameters } from "serpapi";
|
6 |
|
7 |
+
/**
 * Reports which search provider `searchWeb` will query, so status messages
 * shown to the user name the backend that is really being used.
 *
 * `searchWeb` checks `SERPER_API_KEY` before `YDC_API_KEY`, so You.com is only
 * the active provider when a You.com key is set and no Serper.dev key is set.
 * Every other configuration is reported as Google.
 *
 * @returns the provider label matching the backend `searchWeb` dispatches to
 */
export function getWebSearchProvider(): WebSearchProvider {
	// Mirror the precedence in `searchWeb`: Serper.dev wins over You.com when both keys exist.
	const usesYou = Boolean(YDC_API_KEY) && !SERPER_API_KEY;
	return usesYou ? WebSearchProvider.YOU : WebSearchProvider.GOOGLE;
}
|
11 |
+
|
12 |
/**
 * Runs a web search for `query` with the first configured provider and
 * returns that provider's JSON result.
 *
 * Provider precedence: Serper.dev, then You.com, then SerpAPI.
 *
 * @param query - search query to send to the provider
 * @throws Error when none of the three API keys is configured
 */
export async function searchWeb(query: string) {
	// Fail fast when no provider is configured at all.
	const noProviderConfigured = !SERPER_API_KEY && !YDC_API_KEY && !SERPAPI_KEY;
	if (noProviderConfigured) {
		throw new Error("No You.com or Serper.dev or SerpAPI key found");
	}

	if (SERPER_API_KEY) {
		return await searchWebSerper(query);
	}
	if (YDC_API_KEY) {
		return await searchWebYouApi(query);
	}
	// Only SerpAPI can be left at this point.
	return await searchWebSerpApi(query);
}
|
25 |
|
26 |
export async function searchWebSerper(query: string) {
|
|
|
68 |
|
69 |
return response;
|
70 |
}
|
71 |
+
|
72 |
+
/**
 * Queries the You.com search API and adapts the response to the
 * `organic_results` shape returned by this function's callers' other providers.
 *
 * @param query - raw search query; it is URL-encoded before being sent
 * @returns an object whose `organic_results` holds one entry per usable hit
 *   (`title`, `link`, `text`, `hostname`), sorted by `text` length, longest first.
 *   `text` is the hit's snippets joined with newlines, or `""` when there are none.
 * @throws Error when the API answers with a non-OK HTTP status
 */
export async function searchWebYouApi(query: string) {
	// Encode the query: a raw `&`, `#`, `+` or space would otherwise truncate or
	// alter the `query` parameter in the request URL.
	const endpoint = `https://api.ydc-index.io/search?query=${encodeURIComponent(query)}`;
	const response = await fetch(endpoint, {
		method: "GET",
		headers: {
			"X-API-Key": YDC_API_KEY,
			"Content-type": "application/json; charset=UTF-8",
		},
	});

	if (!response.ok) {
		throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
	}

	// The cast is unchecked, so treat `hits` and `snippets` as possibly missing at runtime.
	const data = (await response.json()) as YouWebSearch;
	const formattedResultsWithSnippets: {
		title: string;
		link: string;
		text: string;
		hostname: string;
	}[] = [];

	for (const { title, url, snippets } of data.hits ?? []) {
		let hostname: string;
		try {
			hostname = new URL(url).hostname;
		} catch {
			// Skip a hit whose URL cannot be parsed instead of failing the whole search.
			continue;
		}
		formattedResultsWithSnippets.push({
			title,
			link: url,
			text: snippets?.join("\n") || "",
			hostname,
		});
	}

	// Descending order by text length, so the hits with the most content come first.
	formattedResultsWithSnippets.sort((a, b) => b.text.length - a.text.length);

	return {
		organic_results: formattedResultsWithSnippets,
	};
}
|
@@ -18,9 +18,28 @@ export interface WebSearchSource {
|
|
18 |
title: string;
|
19 |
link: string;
|
20 |
hostname: string;
|
|
|
21 |
}
|
22 |
|
23 |
export type WebSearchMessageSources = {
|
24 |
type: "sources";
|
25 |
sources: WebSearchSource[];
|
26 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
title: string;
|
19 |
link: string;
|
20 |
hostname: string;
|
21 |
+
text?: string; // You.com provides text of webpage right away
|
22 |
}
|
23 |
|
24 |
export type WebSearchMessageSources = {
|
25 |
type: "sources";
|
26 |
sources: WebSearchSource[];
|
27 |
};
|
28 |
+
|
29 |
+
/** Top-level JSON body of a You.com search response, as consumed by this project. */
export interface YouWebSearch {
	/** Search hits, one per result page. */
	hits: YouSearchHit[];
	/** Latency figure reported by the API — units are not established here; TODO confirm. */
	latency: number;
}
|
33 |
+
|
34 |
+
/** A single hit inside a You.com search response. */
interface YouSearchHit {
	/** Address of the result page. */
	url: string;
	/** Title of the result page. */
	title: string;
	/** Description text for the result. */
	description: string;
	/**
	 * Text snippets taken from the page.
	 * NOTE(review): the consumer reads this with `snippets?.join(...)`, which suggests
	 * it may be absent in practice — confirm against the API before relying on it.
	 */
	snippets: string[];
}
|
40 |
+
|
41 |
+
/**
 * Display names of the search backends. The string values are user-facing:
 * they are interpolated into the "Searching …" status update.
 */
// eslint-disable-next-line no-shadow
export enum WebSearchProvider {
	/** Label used for the Google-backed providers. */
	GOOGLE = "Google",
	/** Label used for the You.com search API. */
	YOU = "You.com",
}
|
@@ -6,7 +6,12 @@ import { UrlDependency } from "$lib/types/UrlDependency";
|
|
6 |
import { defaultModel, models, oldModels, validateModel } from "$lib/server/models";
|
7 |
import { authCondition, requiresUser } from "$lib/server/auth";
|
8 |
import { DEFAULT_SETTINGS } from "$lib/types/Settings";
|
9 |
-
import {
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
export const load: LayoutServerLoad = async ({ locals, depends, url }) => {
|
12 |
const { conversations } = collections;
|
@@ -82,7 +87,7 @@ export const load: LayoutServerLoad = async ({ locals, depends, url }) => {
|
|
82 |
ethicsModalAcceptedAt: settings?.ethicsModalAcceptedAt ?? null,
|
83 |
activeModel: settings?.activeModel ?? DEFAULT_SETTINGS.activeModel,
|
84 |
hideEmojiOnSidebar: settings?.hideEmojiOnSidebar ?? false,
|
85 |
-
searchEnabled: !!(SERPAPI_KEY || SERPER_API_KEY),
|
86 |
customPrompts: settings?.customPrompts ?? {},
|
87 |
},
|
88 |
models: models.map((model) => ({
|
|
|
6 |
import { defaultModel, models, oldModels, validateModel } from "$lib/server/models";
|
7 |
import { authCondition, requiresUser } from "$lib/server/auth";
|
8 |
import { DEFAULT_SETTINGS } from "$lib/types/Settings";
|
9 |
+
import {
|
10 |
+
SERPAPI_KEY,
|
11 |
+
SERPER_API_KEY,
|
12 |
+
MESSAGES_BEFORE_LOGIN,
|
13 |
+
YDC_API_KEY,
|
14 |
+
} from "$env/static/private";
|
15 |
|
16 |
export const load: LayoutServerLoad = async ({ locals, depends, url }) => {
|
17 |
const { conversations } = collections;
|
|
|
87 |
ethicsModalAcceptedAt: settings?.ethicsModalAcceptedAt ?? null,
|
88 |
activeModel: settings?.activeModel ?? DEFAULT_SETTINGS.activeModel,
|
89 |
hideEmojiOnSidebar: settings?.hideEmojiOnSidebar ?? false,
|
90 |
+
searchEnabled: !!(SERPAPI_KEY || SERPER_API_KEY || YDC_API_KEY),
|
91 |
customPrompts: settings?.customPrompts ?? {},
|
92 |
},
|
93 |
models: models.map((model) => ({
|