6 catalog data and mode mappers (#25)

* supabase product proxy and rendering

* minor pipeline refactor

* refactoring and demand estimation

* tracking of date index searching

* fixing changes of imports

* data seeding

* chore: airline basic refactor

* feat: huge push of product changes and item review with cart

* refactored design

* chore: moving route elsewhere and align

* fix: build of web/

* chore: fixing paper build

* fixing chars
This commit is contained in:
Daniel Alves Rösel
2025-11-25 11:00:31 +01:00
committed by GitHub
parent 894ce87a5d
commit 8b76d24ade
29 changed files with 1390 additions and 1237 deletions

View File

@@ -1,957 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "62eafcd9-5462-4063-8873-0e7fb9add907",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from kafka import KafkaConsumer\n",
"import pandas as pd\n",
"import json\n",
"import numpy as np\n",
"import os\n",
"from dotenv import load_dotenv\n",
"import matplotlib.pyplot as plt\n",
"from IPython.display import display, SVG, Image\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4af65cb4-e8cf-4877-b2db-13ac19f3838f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 73 entries, 0 to 72\n",
"Data columns (total 13 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sessionId 73 non-null object \n",
" 1 eventName 73 non-null object \n",
" 2 page 73 non-null object \n",
" 3 productId 67 non-null object \n",
" 4 storeMode 73 non-null object \n",
" 5 userAgent 73 non-null object \n",
" 6 ts 73 non-null object \n",
" 7 metadata_referrer 6 non-null object \n",
" 8 metadata_roomType 45 non-null object \n",
" 9 metadata_price 45 non-null float64\n",
" 10 metadata_nights 45 non-null float64\n",
" 11 metadata_elementText 22 non-null object \n",
" 12 metadata_dwellTime 22 non-null float64\n",
"dtypes: float64(3), object(10)\n",
"memory usage: 7.5+ KB\n"
]
}
],
"source": [
"KAFKA_PORT=os.getenv(\"KAFKA_PORT\", 9092)\n",
"topic = \"user-interactions\"\n",
"consumer = KafkaConsumer(\n",
" topic, \n",
" enable_auto_commit=True,\n",
" value_deserializer=lambda x: json.loads(x.decode('utf-8')),\n",
" auto_offset_reset='earliest', \n",
" bootstrap_servers=['localhost:9092'])\n",
"messages=consumer.poll(timeout_ms=1000,max_records=10000)\n",
"df = []\n",
"for m in messages.values():\n",
" for i in m:\n",
" df.append(i.value)\n",
"df = pd.DataFrame(df)\n",
"# explode metadata col json\n",
"df = df.join(pd.json_normalize(df.pop(\"metadata\"), sep=\".\").add_prefix(\"metadata_\"))\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f6819a1c-32ab-49c7-845b-5df7bf60f561",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sessionId</th>\n",
" <th>eventName</th>\n",
" <th>page</th>\n",
" <th>productId</th>\n",
" <th>storeMode</th>\n",
" <th>userAgent</th>\n",
" <th>ts</th>\n",
" <th>metadata_referrer</th>\n",
" <th>metadata_roomType</th>\n",
" <th>metadata_price</th>\n",
" <th>metadata_nights</th>\n",
" <th>metadata_elementText</th>\n",
" <th>metadata_dwellTime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>d176d7c9-4027-4702-9e31-2a71395cdda0</td>\n",
" <td>page_view</td>\n",
" <td>/products</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:23:46.270Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>f0317a5d-e424-44e9-b784-c8f7291ffe31</td>\n",
" <td>page_view</td>\n",
" <td>/</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...</td>\n",
" <td>2025-11-14T13:26:00.291Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>f0317a5d-e424-44e9-b784-c8f7291ffe31</td>\n",
" <td>page_view</td>\n",
" <td>/products</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...</td>\n",
" <td>2025-11-14T13:26:07.769Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>f0317a5d-e424-44e9-b784-c8f7291ffe31</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...</td>\n",
" <td>2025-11-14T13:26:15.010Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>269.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>page_view</td>\n",
" <td>/products</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:15.457Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:15.591Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>264.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432</th>\n",
" <td>214d9fad-9b00-40c3-bd0e-7739b6acd654</td>\n",
" <td>click</td>\n",
" <td>1762448192425</td>\n",
" <td>DIV</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>/</td>\n",
" <td>NaN</td>\n",
" <td>1623.0</td>\n",
" <td>493.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:21.483Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>264.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>hover_over_title</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:22.646Z</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Grand Plaza Hotel</td>\n",
" <td>1200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:25.889Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>264.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>page_view</td>\n",
" <td>/products</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:53:59.993Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:54:10.705Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>223.0</td>\n",
" <td>3.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>hover_over_title</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:54:11.771Z</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>416.0</td>\n",
" <td>397.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Grand Plaza Hotel</td>\n",
" <td>1200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-1</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:54:29.772Z</td>\n",
" <td>NaN</td>\n",
" <td>Standard Room</td>\n",
" <td>267.0</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>hover_over_title</td>\n",
" <td>/products</td>\n",
" <td>htl-1</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:54:30.833Z</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Seaside Resort</td>\n",
" <td>1200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sessionId eventName page \\\n",
"0 d176d7c9-4027-4702-9e31-2a71395cdda0 page_view /products \n",
"1 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view / \n",
"2 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view /products \n",
"3 f0317a5d-e424-44e9-b784-c8f7291ffe31 view_item_page /products \n",
"4 238dc588-a7ab-4c0e-bccd-6abca5076c66 page_view /products \n",
"5 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n",
"6 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n",
"7 238dc588-a7ab-4c0e-bccd-6abca5076c66 hover_over_title /products \n",
"8 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n",
"35 013fc334-4045-4d5a-8739-dd0a8766a63b page_view /products \n",
"36 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n",
"37 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n",
"38 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n",
"39 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n",
"\n",
" productId storeMode userAgent \\\n",
"0 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"1 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n",
"2 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n",
"3 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n",
"4 None hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"5 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"6 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"7 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"8 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"35 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"36 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"37 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"38 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"39 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"\n",
" ts metadata_referrer metadata_roomType \\\n",
"0 2025-11-14T13:23:46.270Z NaN \n",
"1 2025-11-14T13:26:00.291Z NaN \n",
"2 2025-11-14T13:26:07.769Z NaN \n",
"3 2025-11-14T13:26:15.010Z NaN Premium Room \n",
"4 2025-11-14T13:27:15.457Z NaN \n",
"5 2025-11-14T13:27:15.591Z NaN Premium Room \n",
"6 2025-11-14T13:27:21.483Z NaN Premium Room \n",
"7 2025-11-14T13:27:22.646Z NaN NaN \n",
"8 2025-11-14T13:27:25.889Z NaN Premium Room \n",
"35 2025-11-14T13:53:59.993Z NaN \n",
"36 2025-11-14T13:54:10.705Z NaN Premium Room \n",
"37 2025-11-14T13:54:11.771Z NaN NaN \n",
"38 2025-11-14T13:54:29.772Z NaN Standard Room \n",
"39 2025-11-14T13:54:30.833Z NaN NaN \n",
"\n",
" metadata_price metadata_nights metadata_elementText metadata_dwellTime \n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 269.0 1.0 NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"5 264.0 2.0 NaN NaN \n",
"6 264.0 2.0 NaN NaN \n",
"7 NaN NaN Grand Plaza Hotel 1200.0 \n",
"8 264.0 2.0 NaN NaN \n",
"35 NaN NaN NaN NaN \n",
"36 223.0 3.0 NaN NaN \n",
"37 NaN NaN Grand Plaza Hotel 1200.0 \n",
"38 267.0 5.0 NaN NaN \n",
"39 NaN NaN Seaside Resort 1200.0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('sessionId').head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "380eca5f-8304-4fb2-be32-e8bcfd312085",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['013fc334-4045-4d5a-8739-dd0a8766a63b',\n",
" '238dc588-a7ab-4c0e-bccd-6abca5076c66',\n",
" 'd176d7c9-4027-4702-9e31-2a71395cdda0',\n",
" 'f0317a5d-e424-44e9-b784-c8f7291ffe31']"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sessions = list(set(df['sessionId'])); sessions # 238dc588-a7ab-4c0e-bccd-6abca5076c66"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f4ae6f81-dcb8-44be-aee7-30dbc3a6bae1",
"metadata": {},
"outputs": [],
"source": [
"# map sessions to experiments"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "050d90a4-20a9-47f5-b998-c31178a54cb3",
"metadata": {},
"outputs": [],
"source": [
"def build_transition_prob_matrix(df: pd.DataFrame):\n",
" df = df.dropna(subset=['eventName'])\n",
" events = df['eventName'].tolist()\n",
" labels = pd.Index(events).unique().tolist()\n",
" idx = {e:i for i,e in enumerate(labels)}\n",
" M = np.zeros((len(labels), len(labels)), dtype=float)\n",
" for a, b in zip(events, events[1:]):\n",
" M[idx[a], idx[b]] += 1\n",
" row_sums = M.sum(axis=1, keepdims=True)\n",
" with np.errstate(divide='ignore', invalid='ignore'):\n",
" P = np.divide(M, row_sums, where=row_sums>0) # row-normalized\n",
" return P, labels"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e68f9004-82f5-4826-aece-e3dc6e15a18f",
"metadata": {},
"outputs": [],
"source": [
"# https://medium.com/data-science/time-series-data-markov-transition-matrices-7060771e362b\n",
"from graphviz import Digraph\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"def _as_prob_df(matrix, labels=None):\n",
" \"\"\"Return a square DataFrame with index=columns=labels.\"\"\"\n",
" if isinstance(matrix, pd.DataFrame):\n",
" # Ensure square and aligned\n",
" assert (matrix.index == matrix.columns).all(), \"Index/columns must match.\"\n",
" return matrix\n",
" matrix = np.asarray(matrix, dtype=float)\n",
" assert matrix.shape[0] == matrix.shape[1], \"Matrix must be square.\"\n",
" if labels is None:\n",
" raise ValueError(\"labels are required when matrix is not a DataFrame\")\n",
" assert len(labels) == matrix.shape[0], \"labels length must match matrix size.\"\n",
" return pd.DataFrame(matrix, index=list(labels), columns=list(labels))\n",
"\n",
"def _df_to_edgelist(P: pd.DataFrame, threshold=0.0, round_digits=2):\n",
" \"\"\"Build weighted edges > threshold.\"\"\"\n",
" edges = []\n",
" for src in P.index:\n",
" for dst in P.columns:\n",
" w = float(P.loc[src, dst])\n",
" if w > threshold:\n",
" edges.append((str(src), str(dst), f\"{w:.{round_digits}f}\"))\n",
" return edges\n",
"\n",
"def render_graph(fname, matrix, ls_index=None, threshold=0.0, fmt=\"svg\", view=False):\n",
" \"\"\"\n",
" fname: output file stem (no extension)\n",
" matrix: NumPy array or pandas DataFrame of transition PROBABILITIES\n",
" ls_index: ordered labels (required if matrix is not a DataFrame)\n",
" threshold: hide edges with weight <= threshold\n",
" fmt: 'svg'|'png'|'pdf' etc.\n",
" view: open after rendering\n",
" \"\"\"\n",
" P = _as_prob_df(matrix, labels=ls_index)\n",
" edges = _df_to_edgelist(P, threshold=threshold)\n",
"\n",
" g = Digraph(format=fmt)\n",
" g.attr(rankdir=\"LR\", size=\"30\")\n",
" g.attr(\"node\", shape=\"circle\")\n",
"\n",
" # ensure isolated nodes appear\n",
" for node in P.index:\n",
" g.node(str(node), width=\"1\", height=\"1\")\n",
"\n",
" for src, dst, label in edges:\n",
" g.edge(src, dst, label=label)\n",
"\n",
" g.render(fname, view=view, cleanup=True)\n",
" return g\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e255a2c1-6454-4e5e-89f6-ef8ac51ab6cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"013fc334-4045-4d5a-8739-dd0a8766a63b\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"565pt\" height=\"354pt\"\n",
" viewBox=\"0.00 0.00 565.00 354.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 349.64)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-349.64 561.05,-349.64 561.05,4 -4,4\"/>\n",
"<!-- page_view -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>page_view</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"48.19\" cy=\"-235.83\" rx=\"48.19\" ry=\"48.19\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-231.16\" font-family=\"Times,serif\" font-size=\"14.00\">page_view</text>\n",
"</g>\n",
"<!-- view_item_page -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>view_item_page</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"232.88\" cy=\"-235.83\" rx=\"69.01\" ry=\"69.01\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-231.16\" font-family=\"Times,serif\" font-size=\"14.00\">view_item_page</text>\n",
"</g>\n",
"<!-- page_view&#45;&gt;view_item_page -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>page_view&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M96.71,-235.83C113.69,-235.83 133.31,-235.83 152.25,-235.83\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"152.1,-239.33 162.1,-235.83 152.1,-232.33 152.1,-239.33\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"130.12\" y=\"-239.78\" font-family=\"Times,serif\" font-size=\"14.00\">1.00</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;view_item_page -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M214.74,-302.59C217.1,-314.51 223.14,-322.84 232.88,-322.84 239.27,-322.84 244.07,-319.26 247.28,-313.42\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"250.57,-314.62 250.52,-304.02 243.95,-312.33 250.57,-314.62\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-326.79\" font-family=\"Times,serif\" font-size=\"14.00\">0.68</text>\n",
"</g>\n",
"<!-- hover_over_title -->\n",
"<g id=\"node3\" class=\"node\">\n",
"<title>hover_over_title</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"463.22\" cy=\"-275.83\" rx=\"69.81\" ry=\"69.81\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"463.22\" y=\"-271.16\" font-family=\"Times,serif\" font-size=\"14.00\">hover_over_title</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;hover_over_title -->\n",
"<g id=\"edge3\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;hover_over_title</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M300.48,-250.14C307.03,-251.43 313.58,-252.69 319.89,-253.83 340.12,-257.51 362.05,-261.1 382.5,-264.27\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"381.77,-267.7 392.19,-265.76 382.83,-260.78 381.77,-267.7\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-263.17\" font-family=\"Times,serif\" font-size=\"14.00\">0.29</text>\n",
"</g>\n",
"<!-- hover_over_paragraph -->\n",
"<g id=\"node4\" class=\"node\">\n",
"<title>hover_over_paragraph</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"463.22\" cy=\"-93.83\" rx=\"93.83\" ry=\"93.83\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"463.22\" y=\"-89.16\" font-family=\"Times,serif\" font-size=\"14.00\">hover_over_paragraph</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;hover_over_paragraph -->\n",
"<g id=\"edge4\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;hover_over_paragraph</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M292.09,-199.63C316.79,-184.27 346.14,-166.02 373.44,-149.04\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"375.08,-152.15 381.72,-143.89 371.38,-146.2 375.08,-152.15\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-185.68\" font-family=\"Times,serif\" font-size=\"14.00\">0.04</text>\n",
"</g>\n",
"<!-- hover_over_title&#45;&gt;view_item_page -->\n",
"<g id=\"edge5\" class=\"edge\">\n",
"<title>hover_over_title&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M399.53,-246.73C384.12,-240.88 367.42,-235.6 351.39,-232.58 339.13,-230.28 326.03,-229.26 313.19,-229.04\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"313.51,-225.54 303.51,-229.04 313.51,-232.54 313.51,-225.54\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-236.53\" font-family=\"Times,serif\" font-size=\"14.00\">1.00</text>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f0779e818b0>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[]\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"8pt\" height=\"8pt\"\n",
" viewBox=\"0.00 0.00 8.00 8.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 4)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-4 4,-4 4,4 -4,4\"/>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f6800fac980>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0.00000000e+000 1.00000000e+000 0.00000000e+000 0.00000000e+000]\n",
" [0.00000000e+000 6.78571429e-001 2.85714286e-001 3.57142857e-002]\n",
" [0.00000000e+000 1.00000000e+000 0.00000000e+000 0.00000000e+000]\n",
" [2.05833592e-312 2.29175545e-312 4.94065646e-324 6.92110218e-310]]\n",
"238dc588-a7ab-4c0e-bccd-6abca5076c66\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"565pt\" height=\"354pt\"\n",
" viewBox=\"0.00 0.00 565.00 354.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 349.64)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-349.64 561.05,-349.64 561.05,4 -4,4\"/>\n",
"<!-- page_view -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>page_view</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"48.19\" cy=\"-109.83\" rx=\"48.19\" ry=\"48.19\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-105.16\" font-family=\"Times,serif\" font-size=\"14.00\">page_view</text>\n",
"</g>\n",
"<!-- view_item_page -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>view_item_page</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"232.88\" cy=\"-197.83\" rx=\"69.01\" ry=\"69.01\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-193.16\" font-family=\"Times,serif\" font-size=\"14.00\">view_item_page</text>\n",
"</g>\n",
"<!-- page_view&#45;&gt;view_item_page -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>page_view&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M92.02,-130.47C112.32,-140.25 137.13,-152.2 160.18,-163.3\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"158.39,-166.32 168.92,-167.51 161.43,-160.02 158.39,-166.32\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"130.12\" y=\"-157.78\" font-family=\"Times,serif\" font-size=\"14.00\">1.00</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;view_item_page -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M214.74,-264.59C217.1,-276.51 223.14,-284.84 232.88,-284.84 239.27,-284.84 244.07,-281.26 247.28,-275.42\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"250.57,-276.62 250.52,-266.02 243.95,-274.33 250.57,-276.62\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-288.79\" font-family=\"Times,serif\" font-size=\"14.00\">0.19</text>\n",
"</g>\n",
"<!-- hover_over_title -->\n",
"<g id=\"node3\" class=\"node\">\n",
"<title>hover_over_title</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"463.22\" cy=\"-275.83\" rx=\"69.81\" ry=\"69.81\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"463.22\" y=\"-271.16\" font-family=\"Times,serif\" font-size=\"14.00\">hover_over_title</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;hover_over_title -->\n",
"<g id=\"edge3\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;hover_over_title</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M289.6,-237.16C299.36,-242.77 309.67,-247.94 319.89,-251.83 339.45,-259.28 361.4,-264.43 382.1,-267.98\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"381.52,-271.43 391.95,-269.55 382.62,-264.52 381.52,-271.43\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-265.16\" font-family=\"Times,serif\" font-size=\"14.00\">0.38</text>\n",
"</g>\n",
"<!-- hover_over_paragraph -->\n",
"<g id=\"node4\" class=\"node\">\n",
"<title>hover_over_paragraph</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"463.22\" cy=\"-93.83\" rx=\"93.83\" ry=\"93.83\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"463.22\" y=\"-89.16\" font-family=\"Times,serif\" font-size=\"14.00\">hover_over_paragraph</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;hover_over_paragraph -->\n",
"<g id=\"edge4\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;hover_over_paragraph</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M300.22,-180.71C317.22,-175.46 335.24,-169.12 351.39,-161.83 358.97,-158.41 366.67,-154.57 374.29,-150.49\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"375.84,-153.63 382.92,-145.75 372.47,-147.5 375.84,-153.63\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-178.15\" font-family=\"Times,serif\" font-size=\"14.00\">0.44</text>\n",
"</g>\n",
"<!-- hover_over_title&#45;&gt;view_item_page -->\n",
"<g id=\"edge5\" class=\"edge\">\n",
"<title>hover_over_title&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M398.52,-248.36C383.21,-242.16 366.82,-235.87 351.39,-230.58 338.42,-226.15 324.5,-221.86 310.94,-217.93\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"312.2,-214.65 301.62,-215.28 310.28,-221.39 312.2,-214.65\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-234.53\" font-family=\"Times,serif\" font-size=\"14.00\">1.00</text>\n",
"</g>\n",
"<!-- hover_over_paragraph&#45;&gt;page_view -->\n",
"<g id=\"edge6\" class=\"edge\">\n",
"<title>hover_over_paragraph&#45;&gt;page_view</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M369.13,-95.76C310.26,-97.17 232.59,-99.41 163.87,-102.58 145.72,-103.42 125.98,-104.58 108.06,-105.73\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"107.86,-102.24 98.1,-106.38 108.31,-109.22 107.86,-102.24\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-106.53\" font-family=\"Times,serif\" font-size=\"14.00\">0.14</text>\n",
"</g>\n",
"<!-- hover_over_paragraph&#45;&gt;view_item_page -->\n",
"<g id=\"edge7\" class=\"edge\">\n",
"<title>hover_over_paragraph&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M372.68,-119.15C354.84,-125.32 336.5,-132.51 319.89,-140.58 312.9,-143.98 305.81,-147.87 298.86,-151.98\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"297.49,-148.71 290.78,-156.91 301.14,-154.69 297.49,-148.71\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-144.53\" font-family=\"Times,serif\" font-size=\"14.00\">0.86</text>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f6800f97110>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0. 1. 0. 0. ]\n",
" [0. 0.1875 0.375 0.4375 ]\n",
" [0. 1. 0. 0. ]\n",
" [0.14285714 0.85714286 0. 0. ]]\n",
"d176d7c9-4027-4702-9e31-2a71395cdda0\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"104pt\" height=\"104pt\"\n",
" viewBox=\"0.00 0.00 104.00 104.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 100.37)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-100.37 100.37,-100.37 100.37,4 -4,4\"/>\n",
"<!-- page_view -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>page_view</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"48.19\" cy=\"-48.19\" rx=\"48.19\" ry=\"48.19\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-43.51\" font-family=\"Times,serif\" font-size=\"14.00\">page_view</text>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f6800f97110>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0.]]\n",
"f0317a5d-e424-44e9-b784-c8f7291ffe31\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"310pt\" height=\"160pt\"\n",
" viewBox=\"0.00 0.00 310.00 160.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 156.44)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-156.44 305.89,-156.44 305.89,4 -4,4\"/>\n",
"<!-- page_view -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>page_view</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"48.19\" cy=\"-69.01\" rx=\"48.19\" ry=\"48.19\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-64.33\" font-family=\"Times,serif\" font-size=\"14.00\">page_view</text>\n",
"</g>\n",
"<!-- page_view&#45;&gt;page_view -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>page_view&#45;&gt;page_view</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M33.03,-115.09C34.09,-126.6 39.14,-135.19 48.19,-135.19 53.98,-135.19 58.13,-131.66 60.65,-126.1\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"64.01,-127.11 62.98,-116.56 57.21,-125.45 64.01,-127.11\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-139.14\" font-family=\"Times,serif\" font-size=\"14.00\">0.50</text>\n",
"</g>\n",
"<!-- view_item_page -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>view_item_page</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"232.88\" cy=\"-69.01\" rx=\"69.01\" ry=\"69.01\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-64.33\" font-family=\"Times,serif\" font-size=\"14.00\">view_item_page</text>\n",
"</g>\n",
"<!-- page_view&#45;&gt;view_item_page -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>page_view&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M96.71,-69.01C113.69,-69.01 133.31,-69.01 152.25,-69.01\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"152.1,-72.51 162.1,-69.01 152.1,-65.51 152.1,-72.51\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"130.12\" y=\"-72.96\" font-family=\"Times,serif\" font-size=\"14.00\">0.50</text>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f6800bf50f0>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[5.0e-001 5.0e-001]\n",
" [9.9e-324 1.5e-323]]\n"
]
}
],
"source": [
"def explore_session(session_id: str):\n",
" subset = df[df['sessionId'] == session_id]\n",
" print(session_id)\n",
" P, labels = build_transition_prob_matrix(subset)\n",
" g = render_graph(f\"session_{session_id}\", P, ls_index=labels, threshold=0.01, fmt=\"svg\", view=False)\n",
" display(g)\n",
" return P\n",
"for session in sessions:\n",
" print(explore_session(session))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (PHANTOM)",
"language": "python",
"name": "phantom"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,19 @@
from .extract import (
KafkaDataFetcher,
ExperimentJoiner,
EventTitleAugmenter,
)
from .demand import DemandEstimator
from .mapping import SessionTransitionProbMatrixTransformer, render_graph
from .pipeline import etl_pipeline, pricing_pipeline
# Public API of the package: exactly the names re-exported by the imports above.
__all__ = [
'KafkaDataFetcher',
'ExperimentJoiner',
'EventTitleAugmenter',
'DemandEstimator',
'SessionTransitionProbMatrixTransformer',
'render_graph',
'etl_pipeline',
'pricing_pipeline',
]

View File

@@ -0,0 +1,39 @@
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from supabase import create_client, Client
import pandas as pd
import os
# Supabase connection settings, read from the environment when this module is imported.
# NOTE(review): os.getenv returns None for an unset variable and nothing here validates
# the values before they reach create_client - confirm the environment is populated
# before this module is imported.
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
# Module-level client used by DemandEstimator.transform to read the product table.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
class DemandEstimator(BaseEstimator, TransformerMixin):
    """Score product demand by counting interaction rows per product.

    Parameters
    ----------
    store_mode : str, default 'hotel'
        Prefix of the Supabase table holding the catalog; the table queried
        is ``f"{store_mode}_products"``.
    session_filter : str, default ""
        When non-empty, only interactions whose ``sessionId`` equals this
        value are counted.
    experiment_filter : str, default ""
        When non-empty, only interactions whose ``experimentId`` equals this
        value are counted.
    """

    def __init__(self,
                 store_mode: str = 'hotel',
                 session_filter: str = "",
                 experiment_filter: str = ""):
        # Parameters are stored unmodified under their own names: scikit-learn's
        # get_params()/clone()/repr() read them back with getattr(self, <name>).
        # Empty (or None) filters are treated as "no filter" by the truthiness
        # checks in transform().
        self.store_mode = store_mode
        self.session_filter = session_filter
        self.experiment_filter = experiment_filter

    @property
    def store(self) -> str:
        """Backward-compatible alias for ``store_mode``."""
        return self.store_mode

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, interactions: pd.DataFrame):
        """Return one row per catalog product with its interaction count.

        Parameters
        ----------
        interactions : pd.DataFrame
            Event rows; must contain ``productId`` (plus ``sessionId`` /
            ``experimentId`` when the matching filter is set).

        Returns
        -------
        pd.DataFrame
            Columns ``productId`` and ``demand_score``. Products without any
            interaction get a score of 0. The frame is empty when
            ``interactions`` is empty or the product table has no rows.
        """
        if interactions.empty:
            return pd.DataFrame(columns=["productId", "demand_score"])
        if self.session_filter:
            interactions = interactions[interactions['sessionId'] == self.session_filter]
        if self.experiment_filter:
            interactions = interactions[interactions['experimentId'] == self.experiment_filter]

        # Only the ids are needed; selecting just "id" also keeps the query
        # valid for product tables that have no room_type column.
        response = supabase.table(f'{self.store_mode}_products').select("id").execute()
        products = pd.DataFrame(response.data)
        if products.empty:
            return pd.DataFrame(columns=["productId", "demand_score"])
        unique_products = products['id'].unique()

        # TODO: improve demand score calculation rather than just counting interactions (use weights..)
        # while keeping the approach as simple as a plain count per product
        counts = interactions['productId'].value_counts()
        # Align on the catalog: products nobody touched score 0, ids that are
        # not in the catalog are dropped. Works even when the filters above
        # removed every row.
        counts = counts.reindex(unique_products, fill_value=0)
        return pd.DataFrame({
            'productId': counts.index,
            'demand_score': counts.to_numpy(),
        })

View File

@@ -15,106 +15,98 @@ N_PRICE_BUCKETS = 5
# Module-level Supabase client; the experiment lookup below queries through it.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
def get_data_from_kafka() -> pd.DataFrame:
    """Fetch every event from the backend Kafka dump endpoint.

    Returns an empty DataFrame when the response is not flagged as successful
    or carries no data. Otherwise the ``metadata`` column (if present) is
    flattened into ``metadata_``-prefixed columns and rows without an
    ``eventName`` are dropped. Raises on a non-2xx HTTP status.
    """
    response = requests.get(f"{BACKEND_URL}/api/kafka/dump")
    response.raise_for_status()
    payload = response.json()

    records = payload.get('data')
    if not (payload.get('success') and records):
        return pd.DataFrame()

    events = pd.DataFrame(records)
    if 'metadata' in events.columns:
        # expand the per-event metadata json into flat columns
        flattened = pd.json_normalize(events.pop("metadata"), sep=".")
        events = events.join(flattened.add_prefix("metadata_"))
    return events.dropna(subset=['eventName'])
def join_with_experiments(df: pd.DataFrame) -> pd.DataFrame:
    """Left-join experiment (and nested task) details onto the events.

    The input is returned untouched when it is empty, has no ``experimentId``
    column, holds only missing experiment ids, or the lookup finds no rows.
    """
    if df.empty or 'experimentId' not in df.columns:
        return df

    experiment_ids = df['experimentId'].dropna().unique()
    if not len(experiment_ids):
        return df

    query = supabase.table('experiments').select(
        'id, subject_name, xp_human_only, xp_market_mode, xp_task_id, task:tasks(task_name, task_description, task_def_of_done)'
    )
    response = query.in_('id', experiment_ids.tolist()).execute()
    if not response.data:
        return df

    experiments = pd.DataFrame(response.data)

    # spread the nested task object into its own columns when any row has one
    if 'task' in experiments.columns and experiments['task'].notnull().any():
        has_task = experiments['task'].notnull()
        task_columns = pd.json_normalize(experiments['task'].dropna())
        # json_normalize renumbers rows from 0; put the original labels back so the join aligns
        task_columns.index = experiments.index[has_task]
        experiments = experiments.drop(columns=['task']).join(task_columns, rsuffix='_task')

    # clearer, exp_-prefixed names for the experiment columns
    column_names = {
        'id': 'experimentId',
        'subject_name': 'exp_subject',
        'xp_human_only': 'exp_human_only',
        'xp_market_mode': 'exp_market_mode',
        'xp_task_id': 'exp_task_id'
    }
    experiments = experiments.rename(columns=column_names)
    return df.merge(experiments, on='experimentId', how='left')
def augment_event_titles(df: pd.DataFrame) -> pd.DataFrame:
    """Turn generic event names into product-specific ones.

    Rows that have both a ``productId`` and a ``metadata_price`` get the
    suffix ``_<productId>@<price bucket>`` appended to ``eventName``; the
    suffix is also stored in a new ``metadata_schema`` column (empty string
    for the other rows). Both columns are written on ``df`` itself, which is
    also the return value.

    A frame that is empty or lacks ``eventName``, ``productId`` or
    ``metadata_price`` is returned unchanged instead of raising ``KeyError``.
    """
    required = {"eventName", "productId", "metadata_price"}
    if df.empty or not required.issubset(df.columns):
        return df

    prices = df["metadata_price"]
    # only build price buckets when at least one price is present
    if prices.notnull().any():
        try:
            price_buckets = pd.qcut(
                prices,
                q=N_PRICE_BUCKETS,
                labels=[f"PB_{i+1}" for i in range(N_PRICE_BUCKETS)],
                duplicates='drop'  # handle duplicate bin edges
            )
        except ValueError:
            # qcut could not produce the buckets (e.g. too few distinct
            # prices for the fixed label list): fall back to the raw price
            price_buckets = prices.apply(lambda x: f"P_{int(x)}" if pd.notnull(x) else "")
    else:
        price_buckets = pd.Series([""] * len(df), index=df.index)

    # metadata_schema: _product_id@price_bucket only if we have product metadata, otherwise keep the original event name
    # TODO: make this adaptive, if we have hover_over_title we append the title, if its view_page we say which page
    df["metadata_schema"] = np.where(
        df["productId"].notnull() & prices.notnull(),
        "_" + df["productId"].astype(str) + "@" + price_buckets.astype(str),
        ""
    )
    df["eventName"] = df["eventName"] + df["metadata_schema"].astype(str)
    return df
def extract() -> pd.DataFrame:
    """Fetch the events, join experiment details, then augment the event names."""
    return augment_event_titles(join_with_experiments(get_data_from_kafka()))
class DataExtractor(BaseEstimator, TransformerMixin):
class KafkaDataFetcher(BaseEstimator, TransformerMixin):
    """Pipeline step that loads the raw events from the backend Kafka dump.

    Parameters
    ----------
    timeout : float or None, default None
        Passed to ``requests.get``. ``None`` (the default) waits
        indefinitely, as before.
    """

    def __init__(self, timeout=None):
        self.timeout = timeout

    def fit(self, X=None, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X=None):
        """Fetch the events and return them as a DataFrame (``X`` is ignored).

        Only fetching happens here; the experiment join and the event-name
        augmentation are separate pipeline steps, so this method must not
        perform them as well.

        Returns an empty DataFrame when the response is not flagged as
        successful or carries no data. Raises on a non-2xx HTTP status.
        """
        resp = requests.get(f"{BACKEND_URL}/api/kafka/dump", timeout=self.timeout)
        resp.raise_for_status()
        data = resp.json()
        if not data.get('success') or not data.get('data'):
            return pd.DataFrame()
        df = pd.DataFrame(data['data'])
        # explode metadata col json
        if 'metadata' in df.columns:
            df = df.join(pd.json_normalize(df.pop("metadata"), sep=".").add_prefix("metadata_"))
        df = df.dropna(subset=['eventName'])
        # remap dateIndex to a nullable integer column; when no event carried
        # one, still provide the column (all missing) instead of raising KeyError
        if 'metadata_dateIndex' in df.columns:
            df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
        else:
            df['dateIndex'] = pd.Series(pd.NA, index=df.index, dtype='Int64')
        return df
if __name__ == "__main__":
    # manual smoke run: show both ends of the extracted frame, then its summary
    df = extract()
    print(df.head(), df.tail(), sep="\n")
    # DataFrame.info() prints its own report and returns None, so this line
    # also prints "None" after the summary
    print(df.info())
class ExperimentJoiner(BaseEstimator, TransformerMixin):
    """Pipeline step that left-joins experiment (and nested task) details onto the events."""

    def fit(self, X=None, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, df):
        """Return ``df`` merged with the matching rows of the ``experiments`` table.

        The input is returned untouched when it is empty, has no
        ``experimentId`` column, holds only missing experiment ids, or the
        lookup finds no rows.
        """
        if df.empty or 'experimentId' not in df.columns:
            return df

        experiment_ids = df['experimentId'].dropna().unique()
        if not len(experiment_ids):
            return df

        query = supabase.table('experiments').select(
            'id, subject_name, xp_human_only, xp_market_mode, xp_task_id, task:tasks(task_name, task_description, task_def_of_done)'
        )
        response = query.in_('id', experiment_ids.tolist()).execute()
        if not response.data:
            return df

        experiments = pd.DataFrame(response.data)

        # spread the nested task object into its own columns when any row has one
        if 'task' in experiments.columns and experiments['task'].notnull().any():
            has_task = experiments['task'].notnull()
            task_columns = pd.json_normalize(experiments['task'].dropna())
            # json_normalize renumbers rows from 0; put the original labels back so the join aligns
            task_columns.index = experiments.index[has_task]
            experiments = experiments.drop(columns=['task']).join(task_columns, rsuffix='_task')

        # clearer, exp_-prefixed names for the experiment columns
        column_names = {
            'id': 'experimentId',
            'subject_name': 'exp_subject',
            'xp_human_only': 'exp_human_only',
            'xp_market_mode': 'exp_market_mode',
            'xp_task_id': 'exp_task_id'
        }
        experiments = experiments.rename(columns=column_names)
        return df.merge(experiments, on='experimentId', how='left')
class EventTitleAugmenter(BaseEstimator, TransformerMixin):
    """Pipeline step that turns generic event names into product-specific ones."""

    def fit(self, X=None, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, df):
        """Append ``_<productId>@<price bucket>`` to the event names.

        Rows that have both a ``productId`` and a ``metadata_price`` get the
        suffix appended to ``eventName``; the suffix is also stored in a new
        ``metadata_schema`` column (empty string for the other rows). Both
        columns are written on ``df`` itself, which is also the return value.

        A frame that is empty or lacks ``eventName``, ``productId`` or
        ``metadata_price`` is returned unchanged instead of raising
        ``KeyError``.
        """
        required = {"eventName", "productId", "metadata_price"}
        if df.empty or not required.issubset(df.columns):
            return df

        prices = df["metadata_price"]
        # only build price buckets when at least one price is present
        if prices.notnull().any():
            try:
                price_buckets = pd.qcut(
                    prices,
                    q=N_PRICE_BUCKETS,
                    labels=[f"PB_{i+1}" for i in range(N_PRICE_BUCKETS)],
                    duplicates='drop'  # handle duplicate bin edges
                )
            except ValueError:
                # qcut could not produce the buckets (e.g. too few distinct
                # prices for the fixed label list): fall back to the raw price
                price_buckets = prices.apply(lambda x: f"P_{int(x)}" if pd.notnull(x) else "")
        else:
            price_buckets = pd.Series([""] * len(df), index=df.index)

        # metadata_schema: _product_id@price_bucket only if we have product metadata, otherwise keep the original event name
        # TODO: make this adaptive, if we have hover_over_title we append the title, if its view_page we say which page
        df["metadata_schema"] = np.where(
            df["productId"].notnull() & prices.notnull(),
            "_" + df["productId"].astype(str) + "@" + price_buckets.astype(str),
            ""
        )
        df["eventName"] = df["eventName"] + df["metadata_schema"].astype(str)
        return df

View File

@@ -1,15 +1,22 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from extract import DataExtractor
from mapping import SessionTransitionProbMatrixTransformer, render_graph
from extract import KafkaDataFetcher, ExperimentJoiner, EventTitleAugmenter
from mapping import SessionTransitionProbMatrixTransformer, render_graph
from demand import DemandEstimator
# exposable pipelines
# ETL: fetch the raw events, attach experiment details, then rewrite the event names.
etl_pipeline = Pipeline([
('kafka_fetch', KafkaDataFetcher()),
('experiment_join', ExperimentJoiner()),
('event_augment', EventTitleAugmenter()),
])
# Pricing: a single demand-estimation step; the __main__ block below feeds it the ETL output.
pricing_pipeline = Pipeline([
('demand_estimation', DemandEstimator()),
])
if __name__ == "__main__":
    # Smoke run of the two exposed pipelines: the ETL output is the input of
    # the pricing pipeline. Neither pipeline's first step uses its fit input,
    # so None is enough to kick off the ETL.
    # The earlier single-step DataExtractor run was dropped: it fetched the
    # same dump a second time and printed a stray "None" via print(result.info()).
    processed_data = etl_pipeline.fit_transform(None)
    pricing = pricing_pipeline.fit_transform(processed_data)
    print(pricing)

View File

@@ -0,0 +1,125 @@
import random
import json
import os
import logging
from dotenv import load_dotenv
from supabase import create_client, Client
from tqdm import tqdm
# Load variables from a .env file (if any) before the environment is read below.
load_dotenv()
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
log = logging.getLogger(__name__)

SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")

# Fail fast with a clear message instead of handing None to create_client.
# The key is checked first so a run with both values missing reports the
# same error as before.
if not SUPABASE_SERVICE_KEY:
    log.error("SUPABASE_SERVICE_ROLE_KEY not found in environment")
    raise ValueError("Missing SUPABASE_SERVICE_ROLE_KEY - required for admin operations")
if not SUPABASE_URL:
    log.error("NEXT_PUBLIC_SUPABASE_URL not found in environment")
    raise ValueError("Missing NEXT_PUBLIC_SUPABASE_URL - required to connect to Supabase")

# Module-level client used by clear_table and seed_table.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
# Number of days products are generated for; the generators emit date_index 1..DAYS.
DAYS = 14
# hotel room configurations
# Keyed by room type. gen_hotel_products stores each value unchanged as the product's
# `metadata` and uses `total` as the upper bound of the randomly drawn availability.
ROOMS = {
"Presidential Suite": {'amenities': ['ocean_view', 'balcony', 'jacuzzi', 'butler_service', 'premium_minibar'], 'total': 1, 'image_url': "", "base_price": 450, 'name': 'Presidential Suite', 'refundable': True, 'max_occupancy': 4},
"Executive Suite": {'amenities': ['city_view', 'balcony', 'workspace', 'lounge_access'], 'total': 2, 'image_url': "", "base_price": 280, 'name': 'Executive Suite', 'refundable': True, 'max_occupancy': 3},
"Junior Suite": {'amenities': ['garden_view', 'mini_fridge', 'coffee_maker'], 'total': 5, 'image_url': "", "base_price": 180, 'name': 'Junior Suite', 'refundable': True, 'max_occupancy': 2},
"Deluxe Room": {'amenities': ['city_view', 'work_desk', 'coffee_maker'], 'total': 8, 'image_url': "", "base_price": 140, 'name': 'Deluxe Room', 'refundable': False, 'max_occupancy': 2},
"Superior Room": {'amenities': ['wifi', 'tv', 'safe'], 'total': 12, 'image_url': "", "base_price": 110, 'name': 'Superior Room', 'refundable': False, 'max_occupancy': 2},
"Standard Room": {'amenities': ['wifi', 'tv'], 'total': 20, 'image_url': "", "base_price": 85, 'name': 'Standard Room', 'refundable': False, 'max_occupancy': 2},
}
# flight configurations
# Keyed by flight type. gen_airline_products stores each value unchanged as the product's
# `metadata` and uses `total` as the upper bound of the randomly drawn availability.
# NOTE(review): if the times are local airport times, the two JFK-LAX entries look off by
# an hour (08:00 + 5h 30m would land at 10:30 LAX time, not 11:30) - confirm the intended values.
FLIGHTS = {
"JFK-LAX-Economy": {'departure': {'time': '08:00', 'airport': 'JFK'}, 'arrival': {'time': '11:30', 'airport': 'LAX'}, 'duration': '5h 30m', 'stops': 0, 'cabin_class': 'economy', 'fare_rule': 'standard', 'refundable': False, 'total': 180, 'base_price': 250},
"JFK-LAX-Business": {'departure': {'time': '08:00', 'airport': 'JFK'}, 'arrival': {'time': '11:30', 'airport': 'LAX'}, 'duration': '5h 30m', 'stops': 0, 'cabin_class': 'business', 'fare_rule': 'flexible', 'refundable': True, 'total': 30, 'base_price': 850},
"ORD-MIA-Economy": {'departure': {'time': '14:15', 'airport': 'ORD'}, 'arrival': {'time': '18:45', 'airport': 'MIA'}, 'duration': '3h 30m', 'stops': 0, 'cabin_class': 'economy', 'fare_rule': 'basic', 'refundable': False, 'total': 200, 'base_price': 180},
"SFO-SEA-Premium": {'departure': {'time': '06:30', 'airport': 'SFO'}, 'arrival': {'time': '08:45', 'airport': 'SEA'}, 'duration': '2h 15m', 'stops': 0, 'cabin_class': 'premium', 'fare_rule': 'standard', 'refundable': False, 'total': 60, 'base_price': 420},
"ATL-DFW-First": {'departure': {'time': '16:00', 'airport': 'ATL'}, 'arrival': {'time': '17:30', 'airport': 'DFW'}, 'duration': '2h 30m', 'stops': 0, 'cabin_class': 'first', 'fare_rule': 'flexible', 'refundable': True, 'total': 12, 'base_price': 1600},
"LAX-SFO-Economy": {'departure': {'time': '10:00', 'airport': 'LAX'}, 'arrival': {'time': '11:30', 'airport': 'SFO'}, 'duration': '1h 30m', 'stops': 0, 'cabin_class': 'economy', 'fare_rule': 'standard', 'refundable': False, 'total': 150, 'base_price': 120},
"MIA-ATL-Premium": {'departure': {'time': '19:00', 'airport': 'MIA'}, 'arrival': {'time': '20:45', 'airport': 'ATL'}, 'duration': '1h 45m', 'stops': 0, 'cabin_class': 'premium', 'fare_rule': 'standard', 'refundable': True, 'total': 50, 'base_price': 380},
"DFW-ORD-Economy": {'departure': {'time': '07:30', 'airport': 'DFW'}, 'arrival': {'time': '10:15', 'airport': 'ORD'}, 'duration': '2h 45m', 'stops': 0, 'cabin_class': 'economy', 'fare_rule': 'basic', 'refundable': False, 'total': 190, 'base_price': 160},
"SEA-LAX-Business": {'departure': {'time': '13:00', 'airport': 'SEA'}, 'arrival': {'time': '15:30', 'airport': 'LAX'}, 'duration': '2h 30m', 'stops': 0, 'cabin_class': 'business', 'fare_rule': 'flexible', 'refundable': True, 'total': 40, 'base_price': 720},
"LAX-JFK-First": {'departure': {'time': '18:00', 'airport': 'LAX'}, 'arrival': {'time': '02:15', 'airport': 'JFK'}, 'duration': '5h 15m', 'stops': 0, 'cabin_class': 'first', 'fare_rule': 'flexible', 'refundable': True, 'total': 16, 'base_price': 1850},
}
def gen_hotel_products():
    """Build one hotel product per (day, room type) for DAYS days.

    Each product carries the room type, a 1-based ``date_index``, the room's
    configuration as ``metadata`` and a random ``availability`` between 0 and
    the room's ``total`` (inclusive).
    """
    return [
        {
            'room_type': name,
            'date_index': offset + 1,
            'metadata': spec,
            'availability': random.randint(0, spec['total']),
        }
        for offset in range(DAYS)
        for name, spec in ROOMS.items()
    ]
def gen_airline_products():
    """Build one flight product per (day, flight type) for DAYS days.

    Each product carries the flight type, a 1-based ``date_index``, the
    flight's configuration as ``metadata`` and a random ``availability``
    between 0 and the flight's ``total`` (inclusive).
    """
    return [
        {
            'flight_type': name,
            'date_index': offset + 1,
            'metadata': spec,
            'availability': random.randint(0, spec['total']),
        }
        for offset in range(DAYS)
        for name, spec in FLIGHTS.items()
    ]
def clear_table(table_name: str):
    """Delete the rows of ``table_name`` whose ids a single select returns.

    Ids are fetched once and deleted in chunks of 100. Any failure is logged
    and re-raised.
    """
    try:
        rows = supabase.table(table_name).select('id').execute().data
        if not rows:
            log.info(f"{table_name} already empty")
            return

        record_ids = [row['id'] for row in rows]
        batch = 100
        offsets = range(0, len(record_ids), batch)
        for start in tqdm(offsets, desc=f"Clearing {table_name}", unit="chunk"):
            doomed = record_ids[start:start + batch]
            supabase.table(table_name).delete().in_('id', doomed).execute()
        log.info(f"Deleted {len(record_ids)} records from {table_name}")
    except Exception as exc:
        log.error(f"Failed to clear {table_name}: {exc}")
        raise
def seed_table(table_name: str, data: list[dict]):
    """Insert ``data`` into ``table_name`` in chunks of 100 records.

    Any failure is logged and re-raised.
    """
    try:
        batch = 100
        record_count = len(data)
        offsets = range(0, record_count, batch)
        for start in tqdm(offsets, desc=f"Seeding {table_name}", unit="chunk"):
            supabase.table(table_name).insert(data[start:start + batch]).execute()
        log.info(f"Inserted {record_count} records into {table_name}")
    except Exception as exc:
        log.error(f"Failed to seed {table_name}: {exc}")
        raise
def main():
    """Generate the hotel and airline products, clear both tables, then seed them."""
    log.info("Generating hotel products...")
    hotel_rows = gen_hotel_products()
    log.info(f"Generated {len(hotel_rows)} hotel products")

    log.info("Generating airline products...")
    airline_rows = gen_airline_products()
    log.info(f"Generated {len(airline_rows)} airline products\n")

    # insertion order drives both loops: hotel table first, airline table second
    payloads = {
        'hotel_products': hotel_rows,
        'airline_products': airline_rows,
    }

    log.info("Clearing existing products...")
    for table in payloads:
        clear_table(table)

    log.info("Seeding products...")
    for table, rows in payloads.items():
        seed_table(table, rows)


if __name__ == "__main__":
    main()