{ "cells": [ { "cell_type": "code", "execution_count": 51, "id": "62eafcd9-5462-4063-8873-0e7fb9add907", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from kafka import KafkaConsumer\n", "import pandas as pd\n", "import json\n", "import numpy as np\n", "import os\n", "from dotenv import load_dotenv\n", "import matplotlib.pyplot as plt\n", "from IPython.display import display, SVG, Image\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 52, "id": "4af65cb4-e8cf-4877-b2db-13ac19f3838f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 73 entries, 0 to 72\n", "Data columns (total 13 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sessionId 73 non-null object \n", " 1 eventName 73 non-null object \n", " 2 page 73 non-null object \n", " 3 productId 67 non-null object \n", " 4 storeMode 73 non-null object \n", " 5 userAgent 73 non-null object \n", " 6 ts 73 non-null object \n", " 7 metadata_referrer 6 non-null object \n", " 8 metadata_roomType 45 non-null object \n", " 9 metadata_price 45 non-null float64\n", " 10 metadata_nights 45 non-null float64\n", " 11 metadata_elementText 22 non-null object \n", " 12 metadata_dwellTime 22 non-null float64\n", "dtypes: float64(3), object(10)\n", "memory usage: 7.5+ KB\n" ] } ], "source": [ "KAFKA_PORT=os.getenv(\"KAFKA_PORT\", 9092)\n", "topic = \"user-interactions\"\n", "consumer = KafkaConsumer(\n", " topic, \n", " enable_auto_commit=True,\n", " value_deserializer=lambda x: json.loads(x.decode('utf-8')),\n", " auto_offset_reset='earliest', \n", " bootstrap_servers=['localhost:9092'])\n", "messages=consumer.poll(timeout_ms=1000,max_records=10000)\n", "df = []\n", "for m in messages.values():\n", " for i in m:\n", " df.append(i.value)\n", "df = pd.DataFrame(df)\n", "# explode metadata col json\n", "df = df.join(pd.json_normalize(df.pop(\"metadata\"), sep=\".\").add_prefix(\"metadata_\"))\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 53, "id": "f6819a1c-32ab-49c7-845b-5df7bf60f561", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sessionIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_roomTypemetadata_pricemetadata_nightsmetadata_elementTextmetadata_dwellTime
0d176d7c9-4027-4702-9e31-2a71395cdda0page_view/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:23:46.270ZNaNNaNNaNNaNNaN
1f0317a5d-e424-44e9-b784-c8f7291ffe31page_view/NonehotelMozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...2025-11-14T13:26:00.291ZNaNNaNNaNNaNNaN
2f0317a5d-e424-44e9-b784-c8f7291ffe31page_view/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...2025-11-14T13:26:07.769ZNaNNaNNaNNaNNaN
3f0317a5d-e424-44e9-b784-c8f7291ffe31view_item_page/productshtl-0hotelMozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...2025-11-14T13:26:15.010ZNaNPremium Room269.01.0NaNNaN
4238dc588-a7ab-4c0e-bccd-6abca5076c66page_view/productsNonehotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:15.457ZNaNNaNNaNNaNNaN
5238dc588-a7ab-4c0e-bccd-6abca5076c66view_item_page/productshtl-0hotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:15.591ZNaNPremium Room264.02.0NaNNaN
6238dc588-a7ab-4c0e-bccd-6abca5076c66view_item_page/productshtl-0hotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:21.483ZNaNPremium Room264.02.0NaNNaN
7238dc588-a7ab-4c0e-bccd-6abca5076c66hover_over_title/productshtl-0hotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:22.646ZNaNNaNNaNNaNGrand Plaza Hotel1200.0
8238dc588-a7ab-4c0e-bccd-6abca5076c66view_item_page/productshtl-0hotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:25.889ZNaNPremium Room264.02.0NaNNaN
35013fc334-4045-4d5a-8739-dd0a8766a63bpage_view/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:53:59.993ZNaNNaNNaNNaNNaN
36013fc334-4045-4d5a-8739-dd0a8766a63bview_item_page/productshtl-0hotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:54:10.705ZNaNPremium Room223.03.0NaNNaN
37013fc334-4045-4d5a-8739-dd0a8766a63bhover_over_title/productshtl-0hotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:54:11.771ZNaNNaNNaNNaNGrand Plaza Hotel1200.0
38013fc334-4045-4d5a-8739-dd0a8766a63bview_item_page/productshtl-1hotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:54:29.772ZNaNStandard Room267.05.0NaNNaN
39013fc334-4045-4d5a-8739-dd0a8766a63bhover_over_title/productshtl-1hotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:54:30.833ZNaNNaNNaNNaNSeaside Resort1200.0
\n", "
" ], "text/plain": [ " sessionId eventName page \\\n", "0 d176d7c9-4027-4702-9e31-2a71395cdda0 page_view /products \n", "1 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view / \n", "2 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view /products \n", "3 f0317a5d-e424-44e9-b784-c8f7291ffe31 view_item_page /products \n", "4 238dc588-a7ab-4c0e-bccd-6abca5076c66 page_view /products \n", "5 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", "6 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", "7 238dc588-a7ab-4c0e-bccd-6abca5076c66 hover_over_title /products \n", "8 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", "35 013fc334-4045-4d5a-8739-dd0a8766a63b page_view /products \n", "36 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n", "37 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n", "38 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n", "39 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n", "\n", " productId storeMode userAgent \\\n", "0 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", "1 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", "2 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", "3 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", "4 None hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", "5 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", "6 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", "7 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", "8 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", "35 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", "36 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", "37 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", "38 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", "39 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", "\n", " ts metadata_referrer metadata_roomType \\\n", "0 2025-11-14T13:23:46.270Z NaN \n", "1 2025-11-14T13:26:00.291Z NaN \n", "2 2025-11-14T13:26:07.769Z NaN \n", "3 2025-11-14T13:26:15.010Z NaN Premium Room \n", "4 2025-11-14T13:27:15.457Z NaN \n", "5 2025-11-14T13:27:15.591Z NaN Premium Room \n", "6 2025-11-14T13:27:21.483Z NaN Premium Room \n", "7 2025-11-14T13:27:22.646Z NaN NaN \n", "8 2025-11-14T13:27:25.889Z NaN Premium Room \n", "35 2025-11-14T13:53:59.993Z NaN \n", "36 2025-11-14T13:54:10.705Z NaN Premium Room \n", "37 2025-11-14T13:54:11.771Z NaN NaN \n", "38 2025-11-14T13:54:29.772Z NaN Standard Room \n", "39 2025-11-14T13:54:30.833Z NaN NaN \n", "\n", " metadata_price metadata_nights metadata_elementText metadata_dwellTime \n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 269.0 1.0 NaN NaN \n", "4 NaN NaN NaN NaN \n", "5 264.0 2.0 NaN NaN \n", "6 264.0 2.0 NaN NaN \n", "7 NaN NaN Grand Plaza Hotel 1200.0 \n", "8 264.0 2.0 NaN NaN \n", "35 NaN NaN NaN NaN \n", "36 223.0 3.0 NaN NaN \n", "37 NaN NaN Grand Plaza Hotel 1200.0 \n", "38 267.0 5.0 NaN NaN \n", "39 NaN NaN Seaside Resort 1200.0 " ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('sessionId').head()" ] }, { "cell_type": "code", "execution_count": 54, "id": "380eca5f-8304-4fb2-be32-e8bcfd312085", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['238dc588-a7ab-4c0e-bccd-6abca5076c66',\n", " 'f0317a5d-e424-44e9-b784-c8f7291ffe31',\n", " 'd176d7c9-4027-4702-9e31-2a71395cdda0',\n", " '013fc334-4045-4d5a-8739-dd0a8766a63b']" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sessions = list(set(df['sessionId'])); sessions # 238dc588-a7ab-4c0e-bccd-6abca5076c66" ] }, { "cell_type": "code", "execution_count": 55, "id": "f4ae6f81-dcb8-44be-aee7-30dbc3a6bae1", "metadata": {}, "outputs": [], "source": [ "# map sessions to experiments" ] }, { "cell_type": "code", "execution_count": 56, "id": "050d90a4-20a9-47f5-b998-c31178a54cb3", "metadata": {}, "outputs": [], "source": [ "def build_transition_prob_matrix(df: pd.DataFrame):\n", " df = df.dropna(subset=['eventName'])\n", " events = df['eventName'].tolist()\n", " labels = pd.Index(events).unique().tolist()\n", " idx = {e:i for i,e in enumerate(labels)}\n", " M = np.zeros((len(labels), len(labels)), dtype=float)\n", " for a, b in zip(events, events[1:]):\n", " M[idx[a], idx[b]] += 1\n", " row_sums = M.sum(axis=1, keepdims=True)\n", " with np.errstate(divide='ignore', invalid='ignore'):\n", " P = np.divide(M, row_sums, where=row_sums>0) # row-normalized\n", " return P, labels" ] }, { "cell_type": "code", "execution_count": 57, "id": "e68f9004-82f5-4826-aece-e3dc6e15a18f", "metadata": {}, "outputs": [], "source": [ "# https://medium.com/data-science/time-series-data-markov-transition-matrices-7060771e362b\n", "from graphviz import Digraph\n", "import numpy as np\n", "import pandas as pd\n", "\n", "def _as_prob_df(matrix, labels=None):\n", " \"\"\"Return a square DataFrame with index=columns=labels.\"\"\"\n", " if isinstance(matrix, pd.DataFrame):\n", " # Ensure square and aligned\n", " assert (matrix.index == matrix.columns).all(), \"Index/columns must match.\"\n", " return matrix\n", " matrix = np.asarray(matrix, dtype=float)\n", " assert matrix.shape[0] == matrix.shape[1], \"Matrix must be square.\"\n", " if labels is None:\n", " raise ValueError(\"labels are required when matrix is not a DataFrame\")\n", " assert len(labels) == matrix.shape[0], \"labels length must match matrix size.\"\n", " return pd.DataFrame(matrix, index=list(labels), columns=list(labels))\n", "\n", "def _df_to_edgelist(P: pd.DataFrame, threshold=0.0, round_digits=2):\n", " \"\"\"Build weighted edges > threshold.\"\"\"\n", " edges = []\n", " for src in P.index:\n", " for dst in P.columns:\n", " w = float(P.loc[src, dst])\n", " if w > threshold:\n", " edges.append((str(src), str(dst), f\"{w:.{round_digits}f}\"))\n", " return edges\n", "\n", "def render_graph(fname, matrix, ls_index=None, threshold=0.0, fmt=\"svg\", view=False):\n", " \"\"\"\n", " fname: output file stem (no extension)\n", " matrix: NumPy array or pandas DataFrame of transition PROBABILITIES\n", " ls_index: ordered labels (required if matrix is not a DataFrame)\n", " threshold: hide edges with weight <= threshold\n", " fmt: 'svg'|'png'|'pdf' etc.\n", " view: open after rendering\n", " \"\"\"\n", " P = _as_prob_df(matrix, labels=ls_index)\n", " edges = _df_to_edgelist(P, threshold=threshold)\n", "\n", " g = Digraph(format=fmt)\n", " g.attr(rankdir=\"LR\", size=\"30\")\n", " g.attr(\"node\", shape=\"circle\")\n", "\n", " # ensure isolated nodes appear\n", " for node in P.index:\n", " g.node(str(node), width=\"1\", height=\"1\")\n", "\n", " for src, dst, label in edges:\n", " g.edge(src, dst, label=label)\n", "\n", " g.render(fname, view=view, cleanup=True)\n", " return g\n" ] }, { "cell_type": "code", "execution_count": 58, "id": "e255a2c1-6454-4e5e-89f6-ef8ac51ab6cc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "238dc588-a7ab-4c0e-bccd-6abca5076c66\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "page_view\n", "\n", "page_view\n", "\n", "\n", "\n", "view_item_page\n", "\n", "view_item_page\n", "\n", "\n", "\n", "page_view->view_item_page\n", "\n", "\n", "1.00\n", "\n", "\n", "\n", "view_item_page->view_item_page\n", "\n", "\n", "0.19\n", "\n", "\n", "\n", "hover_over_title\n", "\n", "hover_over_title\n", "\n", "\n", "\n", "view_item_page->hover_over_title\n", "\n", "\n", "0.38\n", "\n", "\n", "\n", "hover_over_paragraph\n", "\n", "hover_over_paragraph\n", "\n", "\n", "\n", "view_item_page->hover_over_paragraph\n", "\n", "\n", "0.44\n", "\n", "\n", "\n", "hover_over_title->view_item_page\n", "\n", "\n", "1.00\n", "\n", "\n", "\n", "hover_over_paragraph->page_view\n", "\n", "\n", "0.14\n", "\n", "\n", "\n", "hover_over_paragraph->view_item_page\n", "\n", "\n", "0.86\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[[0. 1. 0. 0. ]\n", " [0. 0.1875 0.375 0.4375 ]\n", " [0. 1. 0. 0. ]\n", " [0.14285714 0.85714286 0. 0. ]]\n", "f0317a5d-e424-44e9-b784-c8f7291ffe31\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "page_view\n", "\n", "page_view\n", "\n", "\n", "\n", "page_view->page_view\n", "\n", "\n", "0.50\n", "\n", "\n", "\n", "view_item_page\n", "\n", "view_item_page\n", "\n", "\n", "\n", "page_view->view_item_page\n", "\n", "\n", "0.50\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[[5.0e-001 5.0e-001]\n", " [9.9e-324 1.5e-323]]\n", "d176d7c9-4027-4702-9e31-2a71395cdda0\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "page_view\n", "\n", "page_view\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[[0.]]\n", "013fc334-4045-4d5a-8739-dd0a8766a63b\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "page_view\n", "\n", "page_view\n", "\n", "\n", "\n", "view_item_page\n", "\n", "view_item_page\n", "\n", "\n", "\n", "page_view->view_item_page\n", "\n", "\n", "1.00\n", "\n", "\n", "\n", "view_item_page->view_item_page\n", "\n", "\n", "0.68\n", "\n", "\n", "\n", "hover_over_title\n", "\n", "hover_over_title\n", "\n", "\n", "\n", "view_item_page->hover_over_title\n", "\n", "\n", "0.29\n", "\n", "\n", "\n", "hover_over_paragraph\n", "\n", "hover_over_paragraph\n", "\n", "\n", "\n", "view_item_page->hover_over_paragraph\n", "\n", "\n", "0.04\n", "\n", "\n", "\n", "hover_over_title->view_item_page\n", "\n", "\n", "1.00\n", "\n", "\n", "\n", "hover_over_paragraph->page_view\n", "\n", "\n", "0.14\n", "\n", "\n", "\n", "hover_over_paragraph->view_item_page\n", "\n", "\n", "0.86\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[[0. 1. 0. 0. ]\n", " [0. 0.67857143 0.28571429 0.03571429]\n", " [0. 1. 0. 0. ]\n", " [0.14285714 0.85714286 0. 0. ]]\n" ] } ], "source": [ "def explore_session(session_id: str):\n", " subset = df[df['sessionId'] == session_id]\n", " print(session_id)\n", " P, labels = build_transition_prob_matrix(subset)\n", " g = render_graph(f\"session_{session_id}\", P, ls_index=labels, threshold=0.01, fmt=\"svg\", view=False)\n", " display(g)\n", " return P\n", "for session in sessions:\n", " print(explore_session(session))" ] }, { "cell_type": "code", "execution_count": null, "id": "4d278c2d-406e-4dc0-b219-5f7b236e852b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (PHANTOM)", "language": "python", "name": "phantom" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.7" } }, "nbformat": 4, "nbformat_minor": 5 }