Skip to content

Commit 6eff75c

Browse files
committed
feat: Add streamlit UI and categorical variable example
1 parent f241c8d commit 6eff75c

12 files changed

+556
-125
lines changed

1_eda.ipynb

+30-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
},
2323
{
2424
"cell_type": "code",
25-
"execution_count": 6,
25+
"execution_count": 2,
2626
"metadata": {},
2727
"outputs": [],
2828
"source": [
@@ -250,6 +250,34 @@
250250
"len(processed_df)"
251251
]
252252
},
253+
{
254+
"cell_type": "markdown",
255+
"metadata": {},
256+
"source": [
257+
"Let's take a quick look at our categories:"
258+
]
259+
},
260+
{
261+
"cell_type": "code",
262+
"execution_count": null,
263+
"metadata": {},
264+
"outputs": [],
265+
"source": [
266+
"categories = set()\n",
267+
"for category_list in processed_df[\"category\"]:\n",
268+
" categories.update(category_list)\n",
269+
"categories"
270+
]
271+
},
272+
{
273+
"cell_type": "code",
274+
"execution_count": null,
275+
"metadata": {},
276+
"outputs": [],
277+
"source": [
278+
"len(categories)"
279+
]
280+
},
253281
{
254282
"cell_type": "markdown",
255283
"metadata": {},
@@ -280,7 +308,7 @@
280308
"outputs": [],
281309
"source": [
282310
"print(\"\\nNaN value counts:\")\n",
283-
"print(processed_df[[\"price\", \"review_count\", \"review_rating\"]].isna().sum())"
311+
"print(processed_df[[\"category\", \"price\", \"review_count\", \"review_rating\"]].isna().sum())"
284312
]
285313
}
286314
],

2_tabular_semantic_search_superlinked.ipynb

+156-99
Large diffs are not rendered by default.

3_tabular_semantic_search_text_to_sql.ipynb

+13-13
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"name": "stderr",
1010
"output_type": "stream",
1111
"text": [
12-
"\u001b[32m2024-12-11 18:15:51.463\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msuperlinked_app.config\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m9\u001b[0m - \u001b[1mLoading '.env' file from: /Users/pauliusztin/Documents/01_projects/hands-on-retrieval/.env\u001b[0m\n"
12+
"\u001b[32m2024-12-12 18:33:52.629\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msuperlinked_app.config\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m9\u001b[0m - \u001b[1mLoading '.env' file from: /Users/pauliusztin/Documents/01_projects/hands-on-retrieval/.env\u001b[0m\n"
1313
]
1414
}
1515
],
@@ -258,7 +258,7 @@
258258
}
259259
],
260260
"source": [
261-
"df = pd.read_json(settings.PROCESSED_DATASET_PATH, lines=True)\n",
261+
"df = pd.read_json(settings.PROCESSED_DATASET_PATH, lines=True).drop(columns=['category'])\n",
262262
"df.head()"
263263
]
264264
},
@@ -378,7 +378,7 @@
378378
{
379379
"data": {
380380
"text/markdown": [
381-
"**Node ID:** 92b1a07a-ac84-4da8-aab3-d562fe52edaf<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'All Aboard! New York: A City Primer', 'price': 9.99}<br>"
381+
"**Node ID:** fe83bde8-54c6-46a7-bd31-7661bab4f8a8<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'All Aboard! New York: A City Primer', 'price': 9.99}<br>"
382382
],
383383
"text/plain": [
384384
"<IPython.core.display.Markdown object>"
@@ -390,7 +390,7 @@
390390
{
391391
"data": {
392392
"text/markdown": [
393-
"**Node ID:** 924225f0-acca-4e27-9edd-41534b4a54a2<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Feminist Baby (Feminist Baby, 4)', 'price': 10.59}<br>"
393+
"**Node ID:** 5ab49fa1-e254-4031-93bd-30dd0921c4ae<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Feminist Baby (Feminist Baby, 4)', 'price': 10.59}<br>"
394394
],
395395
"text/plain": [
396396
"<IPython.core.display.Markdown object>"
@@ -402,7 +402,7 @@
402402
{
403403
"data": {
404404
"text/markdown": [
405-
"**Node ID:** 0c2201c5-8c0a-403a-875a-92903efc509f<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'The Mindful Dragon: A Dragon Book about Mindfulness. Teach Your Dragon To Be Mindful. A Cute Children Story to Teach Kids about Mindfulness, Focus and Peace. (My Dragon Books)', 'price': 11.69}<br>"
405+
"**Node ID:** cc2142f6-58bf-44ee-9da5-e1df181c0d13<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'The Mindful Dragon: A Dragon Book about Mindfulness. Teach Your Dragon To Be Mindful. A Cute Children Story to Teach Kids about Mindfulness, Focus and Peace. (My Dragon Books)', 'price': 11.69}<br>"
406406
],
407407
"text/plain": [
408408
"<IPython.core.display.Markdown object>"
@@ -428,7 +428,7 @@
428428
{
429429
"data": {
430430
"text/markdown": [
431-
"**Node ID:** 21ef7ca7-72b0-407f-8f30-6b02b2b91cdc<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Stables: Beautiful Paddocks, Horse Barns, and Tack Rooms', 'price': 53.1, 'review_rating': 4.7}<br>"
431+
"**Node ID:** bbb01131-a9ca-4b8f-a06a-e5b713bbe0ff<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Stables: Beautiful Paddocks, Horse Barns, and Tack Rooms', 'price': 53.1, 'review_rating': 4.7}<br>"
432432
],
433433
"text/plain": [
434434
"<IPython.core.display.Markdown object>"
@@ -440,7 +440,7 @@
440440
{
441441
"data": {
442442
"text/markdown": [
443-
"**Node ID:** e8f41cdf-af01-4134-9bf1-40a135be8ec4<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'The Mindful Dragon: A Dragon Book about Mindfulness. Teach Your Dragon To Be Mindful. A Cute Children Story to Teach Kids about Mindfulness, Focus and Peace. (My Dragon Books)', 'price': 11.69, 'review_rating': 4.7}<br>"
443+
"**Node ID:** a0c8a2f9-595b-4fb4-b140-0a0bf68f436f<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'The Mindful Dragon: A Dragon Book about Mindfulness. Teach Your Dragon To Be Mindful. A Cute Children Story to Teach Kids about Mindfulness, Focus and Peace. (My Dragon Books)', 'price': 11.69, 'review_rating': 4.7}<br>"
444444
],
445445
"text/plain": [
446446
"<IPython.core.display.Markdown object>"
@@ -452,7 +452,7 @@
452452
{
453453
"data": {
454454
"text/markdown": [
455-
"**Node ID:** 95f86c04-2b9b-4145-b851-be4931b9b487<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Build Your Running Body (A Total-Body Fitness Plan for All Distance Runners, from Milers to Ultramarathoners—Run Farther, Faster, and Injury-Free)', 'price': 13.49, 'review_rating': 4.7}<br>"
455+
"**Node ID:** c1b7b9c7-1ceb-43ee-b7b9-b93990352772<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Build Your Running Body (A Total-Body Fitness Plan for All Distance Runners, from Milers to Ultramarathoners—Run Farther, Faster, and Injury-Free)', 'price': 13.49, 'review_rating': 4.7}<br>"
456456
],
457457
"text/plain": [
458458
"<IPython.core.display.Markdown object>"
@@ -478,7 +478,7 @@
478478
{
479479
"data": {
480480
"text/markdown": [
481-
"**Node ID:** 15713c72-c930-4eb0-90b3-2ad617e3da6b<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Stables: Beautiful Paddocks, Horse Barns, and Tack Rooms', 'review_count': 100, 'price': 53.1}<br>"
481+
"**Node ID:** 14d164c1-9926-486e-8616-4e945a345174<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Stables: Beautiful Paddocks, Horse Barns, and Tack Rooms', 'review_count': 100, 'price': 53.1}<br>"
482482
],
483483
"text/plain": [
484484
"<IPython.core.display.Markdown object>"
@@ -490,7 +490,7 @@
490490
{
491491
"data": {
492492
"text/markdown": [
493-
"**Node ID:** 285e8590-5b71-4bd9-b81a-ae6125a9bcf4<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'The Mindful Dragon: A Dragon Book about Mindfulness. Teach Your Dragon To Be Mindful. A Cute Children Story to Teach Kids about Mindfulness, Focus and Peace. (My Dragon Books)', 'review_count': 623, 'price': 11.69}<br>"
493+
"**Node ID:** b9a67ec0-7101-446d-b36c-bb8435442123<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'The Mindful Dragon: A Dragon Book about Mindfulness. Teach Your Dragon To Be Mindful. A Cute Children Story to Teach Kids about Mindfulness, Focus and Peace. (My Dragon Books)', 'review_count': 623, 'price': 11.69}<br>"
494494
],
495495
"text/plain": [
496496
"<IPython.core.display.Markdown object>"
@@ -502,7 +502,7 @@
502502
{
503503
"data": {
504504
"text/markdown": [
505-
"**Node ID:** 211e96fb-f435-49d3-a952-41209d479280<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Build Your Running Body (A Total-Body Fitness Plan for All Distance Runners, from Milers to Ultramarathoners—Run Farther, Faster, and Injury-Free)', 'review_count': 573, 'price': 13.49}<br>"
505+
"**Node ID:** 094e44be-bf37-4d56-9d9d-3045ffbc6004<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'Build Your Running Body (A Total-Body Fitness Plan for All Distance Runners, from Milers to Ultramarathoners—Run Farther, Faster, and Injury-Free)', 'review_count': 573, 'price': 13.49}<br>"
506506
],
507507
"text/plain": [
508508
"<IPython.core.display.Markdown object>"
@@ -514,7 +514,7 @@
514514
{
515515
"data": {
516516
"text/markdown": [
517-
"**Node ID:** 799f5a70-47c3-40af-b96a-be80fc1ccfa1<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'All Aboard! New York: A City Primer', 'review_count': 74, 'price': 9.99}<br>"
517+
"**Node ID:** 2f8c362a-f7ea-4643-8986-107d0ba7ea5b<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'All Aboard! New York: A City Primer', 'review_count': 74, 'price': 9.99}<br>"
518518
],
519519
"text/plain": [
520520
"<IPython.core.display.Markdown object>"
@@ -526,7 +526,7 @@
526526
{
527527
"data": {
528528
"text/markdown": [
529-
"**Node ID:** 38440887-0a63-4915-abe3-8fad6fd0ad10<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'The Rise: Creativity, the Gift of Failure, and the Search for Mastery', 'review_count': 204, 'price': 13.79}<br>"
529+
"**Node ID:** a3c0a9e4-c3b5-482e-853f-d66ac38f7787<br>**Similarity:** None<br>**Text:** <br>**Metadata:** {'title': 'The Rise: Creativity, the Gift of Failure, and the Search for Mastery', 'review_count': 204, 'price': 13.79}<br>"
530530
],
531531
"text/plain": [
532532
"<IPython.core.display.Markdown object>"

INSTALL_AND_USAGE.md

+8-1
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,9 @@ data/
9090

9191
1. Start it up:
9292
```bash
93-
make start-server
93+
make start-superlinked-server
9494
```
95+
The FastAPI endpoint docs are available at `http://localhost:8080/docs`.
9596

9697
2. Load your data:
9798
```bash
@@ -104,4 +105,10 @@ data/
104105
make post-semantic-query # Natural language search
105106
```
106107

108+
4. Start the Streamlit UI:
109+
```bash
110+
make start-ui
111+
```
112+
The UI is accessible at `http://localhost:8501/`.
113+
107114
> 🔔 Make sure the server is running (step 1) before executing the data loading or search commands.

Makefile

+4-1
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,7 @@ post-semantic-query:
3131
'http://localhost:8080/api/v1/search/semantic_query' \
3232
-H 'accept: application/json' \
3333
-H 'Content-Type: application/json' \
34-
-d '{"natural_query": "books with a price lower than 100", "limit": 3}' | jq '.'
34+
-d '{"natural_query": "books with a price lower than 100", "limit": 3}' | jq '.'
35+
36+
start-ui:
37+
uv run streamlit run tools/streamlit_app.py

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dependencies = [
1818
"zstandard>=0.23.0",
1919
"sqlalchemy>=2.0.36",
2020
"matplotlib>=3.9.3",
21+
"streamlit>=1.41.0",
2122
]
2223

2324
[dependency-groups]

superlinked_app/constants.py

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
TYPES = ["product", "book"]
2+
3+
CATEGORIES = [
4+
"Accessories",
5+
"Appliances",
6+
"Arts & Photography",
7+
"Arts, Crafts & Sewing",
8+
"Automotive",
9+
"Baby Care",
10+
"Baby Products",
11+
"Bath",
12+
"Beauty & Personal Care",
13+
"Bedding",
14+
"Beverages",
15+
"Biographies & Memoirs",
16+
"Books",
17+
"CDs & Vinyl",
18+
"Camera & Photo",
19+
"Cell Phones & Accessories",
20+
"Children's Books",
21+
"Christian Books & Bibles",
22+
"Classical",
23+
"Clothing, Shoes & Jewelry",
24+
"Computers & Accessories",
25+
"Costumes & Accessories",
26+
"Dogs",
27+
"Electrical",
28+
"Electronics",
29+
"Event & Party Supplies",
30+
"Exercise & Fitness",
31+
"Exterior Accessories",
32+
"GPS, Finders & Accessories",
33+
"Grocery & Gourmet Food",
34+
"Hair Care",
35+
"Health & Household",
36+
"Home & Kitchen",
37+
"Hunting & Fishing",
38+
"Industrial & Scientific",
39+
"Industrial Electrical",
40+
"Kitchen & Dining",
41+
"Lighting Assemblies & Accessories",
42+
"Lights & Lighting Accessories",
43+
"Luggage & Travel Gear",
44+
"Makeup",
45+
"Medical Supplies & Equipment",
46+
"Men",
47+
"Movies & TV",
48+
"Musical Instruments",
49+
"Office & School Supplies",
50+
"Office Products",
51+
"Patio Furniture & Accessories",
52+
"Patio, Lawn & Garden",
53+
"Pet Supplies",
54+
"Pop",
55+
"Portable Audio & Video",
56+
"Power & Hand Tools",
57+
"Raw Materials",
58+
"Replacement Parts",
59+
"Self-Help",
60+
"Sports & Outdoor Play",
61+
"Sports & Outdoors",
62+
"Stuffed Animals & Plush Toys",
63+
"Tires & Wheels",
64+
"Tools & Home Improvement",
65+
"Toy Figures & Playsets",
66+
"Toys & Games",
67+
"Vehicles",
68+
"Video Games",
69+
"Wall Art",
70+
"Women",
71+
]

superlinked_app/data_processing.py

+26
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,30 @@
1+
import random
12
from typing import Optional
23

34
import pandas as pd
45

56

7+
def parse_category(category: Optional[list[str]]) -> list[str]:
    """Normalise a raw category value into a short list of category names.

    Most rows keep only their first category; roughly one in ten keeps the
    first two, so the processed data also contains multi-category examples.
    The choice is drawn from the global ``random`` generator, so callers that
    need reproducible output must seed it before calling this function.

    Args:
        category: The raw value. Normally a list of category names
            (e.g., ``['Books', 'Fiction', 'Literature']``). A string holding
            such a list literal, or a single bare category name, is also
            accepted. ``None``/NaN, or a list containing a missing entry,
            is treated as "no category".

    Returns:
        A list with the first one or two category names, stripped of
        surrounding whitespace, or an empty list when the value is missing
        or cannot be parsed.
    """
    import ast  # local import: only needed to decode stringified lists

    # Iterating a string would yield single characters, so decode it first:
    # either a list literal ("['Books', 'Fiction']") or one bare name.
    if isinstance(category, str):
        text = category.strip()
        parsed = None
        if text.startswith("[") and text.endswith("]"):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                parsed = None
        if isinstance(parsed, (list, tuple)):
            category = list(parsed)
        else:
            category = [text] if text else []

    # pd.isna returns an array for lists and a scalar otherwise, hence the
    # two separate missing-value checks.
    if isinstance(category, list):
        if pd.isna(category).any():
            return []
    elif pd.isna(category):
        return []

    try:
        # Keep a single category ~90% of the time, two otherwise.
        keep_num_categories = 1 if random.random() < 0.9 else 2
        return [c.strip() for c in category][:keep_num_categories]
    except (TypeError, AttributeError, ValueError, IndexError):
        # Non-iterable values raise TypeError; non-string entries raise
        # AttributeError on .strip(). Both mean "unparseable".
        return []
26+
27+
628
def parse_stars(stars: str) -> Optional[float]:
729
"""Parse the stars rating from a string to a float value between 0 and 5.
830
@@ -89,13 +111,16 @@ def process_amazon_dataset(df: pd.DataFrame) -> pd.DataFrame:
89111
- price (float): Price value
90112
"""
91113

114+
random.seed(6)
115+
92116
# Create a copy to avoid modifying the original DataFrame
93117
df_processed = df.copy()
94118

95119
# Keep only required columns
96120
columns_to_keep = [
97121
"asin",
98122
"type",
123+
"category",
99124
"title",
100125
"description",
101126
"stars",
@@ -105,6 +130,7 @@ def process_amazon_dataset(df: pd.DataFrame) -> pd.DataFrame:
105130
df_processed = df_processed[columns_to_keep]
106131

107132
# Apply transformations
133+
df_processed["category"] = df_processed["category"].apply(parse_category)
108134
df_processed["review_rating"] = df_processed["stars"].apply(parse_stars)
109135
df_processed["review_count"] = df_processed["ratings"].apply(parse_ratings)
110136
df_processed["price"] = df_processed["price"].apply(parse_price)

superlinked_app/index.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
from superlinked import framework as sl
22

3+
from superlinked_app import constants
4+
35

46
class ProductSchema(sl.Schema):
57
id: sl.IdField
68
type: sl.String
9+
category: sl.StringList
710
title: sl.String
811
description: sl.String
912
review_rating: sl.Float
@@ -13,6 +16,12 @@ class ProductSchema(sl.Schema):
1316

1417
product = ProductSchema()
1518

19+
category_space = sl.CategoricalSimilaritySpace(
20+
category_input=product.category,
21+
categories=constants.CATEGORIES,
22+
uncategorized_as_category=True,
23+
negative_filter=-1,
24+
)
1625
description_space = sl.TextSimilaritySpace(
1726
text=product.description, model="Alibaba-NLP/gte-large-en-v1.5"
1827
)
@@ -24,6 +33,11 @@ class ProductSchema(sl.Schema):
2433
)
2534

2635
product_index = sl.Index(
27-
spaces=[description_space, review_rating_maximizer_space, price_minimizer_space],
28-
fields=[product.type, product.review_rating, product.price],
36+
spaces=[
37+
category_space,
38+
description_space,
39+
review_rating_maximizer_space,
40+
price_minimizer_space,
41+
],
42+
fields=[product.type, product.category, product.review_rating, product.price],
2943
)

0 commit comments

Comments
 (0)