From ab8b8787a85875559146a92429611ed7781bd142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Alves=20R=C3=B6sel?= <60182044+velocitatem@users.noreply.github.com> Date: Sat, 15 Nov 2025 16:16:01 +0100 Subject: [PATCH] 13 agentic behavior runner v1 (#14) * baseline setup of agent abstract * feat: new implementation of simple AI agent that can follow a goal and return * refactored import structure and created full tests * pytest setup a github workflow to run tests + more ignores * singularity for pushing * fixing builds of PDFs * inital structure of docs * init styles and docs * basic style implementation * 13 create outline for research paper draft (#18) * updated outline for paper from issue * extra paper sections and some formalization of series data * algorithms and acknowledgements * updated outline for paper from issue * Refactor docker-compose services to use individual Dockerfiles (#20) * Initial plan * Refactor services into individual Dockerfiles Co-authored-by: velocitatem <60182044+velocitatem@users.noreply.github.com> * Add EXPOSE directives to all Dockerfiles with port documentation Co-authored-by: velocitatem <60182044+velocitatem@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: velocitatem <60182044+velocitatem@users.noreply.github.com> * 2 nextjs scaffold with store mode shop and admin session experiment wiring event emission v1 (#17) * chore: cleaning gitignore * formating and env documentation * feat: context switching of hotel/airline depndent on env var via middleware * fixed alignment and building * wrong file * prods * fixed applying style * better session cookie management * tentative session storage with maybe using airtable * migrated api of ingestion * events and products apge * fixing build * 13 create outline for research paper draft (#18) * updated outline for paper from issue * extra paper sections and some formalization of series data * algorithms and acknowledgements * updated outline for paper from issue * upadted text formating * event unification * refactor tracking to ues callbacks instead of refs * implement a pricing display api with session passing * moved middleware to proxy according to new changes in Nextjs * refactoed kafka ingestion to go via backend not web-db * Refactor docker-compose services to use individual Dockerfiles (#20) * Initial plan * Refactor services into individual Dockerfiles Co-authored-by: velocitatem <60182044+velocitatem@users.noreply.github.com> * Add EXPOSE directives to all Dockerfiles with port documentation Co-authored-by: velocitatem <60182044+velocitatem@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: velocitatem <60182044+velocitatem@users.noreply.github.com> * fixing small bugs and adding exepriments to tracking * added some doc * fixing prod * prod kafka server logging * topic auto create * pytest setup a github workflow to run tests + more ignores * getting data from agents properly * proper pipeline to handle data and build matrices * fixing backend dumping * fixing agents and ignore * fixing import for tests --------- Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> --- .github/workflows/pytest.yml | 30 + .gitignore | 8 +- Makefile | 15 +- backend/server/app.py | 66 +- experiments/__init__.py | 0 experiments/agents/__init__.py | 1 + experiments/agents/agent.py | 44 ++ experiments/agents/base.py | 19 + experiments/agents/test.py | 30 + experiments/data_export.ipynb | 1169 ++++++++++++----------------- experiments/procesing/extract.py | 84 +++ experiments/procesing/mapping.py | 158 ++++ experiments/procesing/pipeline.py | 19 + paper/concat_code.sh | 5 +- pytest.ini | 7 + requirements.txt | 5 + 16 files changed, 955 insertions(+), 705 deletions(-) create mode 100644 .github/workflows/pytest.yml create mode 100644 experiments/__init__.py create mode 100644 experiments/agents/__init__.py create mode 100644 experiments/agents/agent.py create mode 100644 experiments/agents/base.py create mode 100644 experiments/agents/test.py create mode 100644 experiments/procesing/extract.py create mode 100644 experiments/procesing/mapping.py create mode 100644 experiments/procesing/pipeline.py create mode 100644 pytest.ini diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 0000000..34d506b --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,30 @@ +name: Run Tests +on: + push: + paths: + - 'experiments/**' + - 'backend/**' + - 'requirements.txt' + - '.github/workflows/pytest.yml' + pull_request: + paths: + - 'experiments/**' + - 'backend/**' + - 'requirements.txt' + - '.github/workflows/pytest.yml' +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.13' + cache: 'pip' + - name: Install dependencies + run: | + python -m venv .venv + .venv/bin/pip install --upgrade pip + .venv/bin/pip install -r requirements.txt + - name: Run tests + run: .venv/bin/pytest -v diff --git a/.gitignore b/.gitignore index 7cdbf14..18da4dd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ **/.env **/.venv -PHANTOM.wiki/ +**/__pycache__ +**/.ipynb_checkpoints/ **/.virtual_documents/ -**/__pycache__/ -**/.ipynb_checkpoints/ \ No newline at end of file +**/session_*.svg +**/*graph.svg +paper/src/bib/auto diff --git a/Makefile b/Makefile index 99c54f8..d9eaac5 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,10 @@ BUILDDIR := build TEX := main.tex JOBNAME := main PDF := paper/$(BUILDDIR)/$(JOBNAME).pdf +VENV := .venv +PYTHON := $(VENV)/bin/python +PIP := $(VENV)/bin/pip +PYTEST := $(VENV)/bin/pytest .DEFAULT_GOAL := help @@ -35,5 +39,14 @@ clean: $(LATEXMK) -C -jobname=$(JOBNAME) -outdir=../$(BUILDDIR) || true rm -rf paper/$(BUILDDIR)/* +$(VENV): + python3 -m venv $(VENV) + $(PIP) install --upgrade pip -.PHONY: all pdf clean watch run.webapp +install: $(VENV) + $(PIP) install -r requirements.txt + +test: $(VENV) + $(PYTEST) -v + +.PHONY: all pdf clean watch run.webapp install test diff --git a/backend/server/app.py b/backend/server/app.py index d57d1de..3830058 100644 --- a/backend/server/app.py +++ b/backend/server/app.py @@ -7,7 +7,7 @@ import uvicorn import os import json from datetime import datetime -from kafka import KafkaProducer, KafkaAdminClient +from kafka import KafkaProducer, KafkaAdminClient, KafkaConsumer from kafka.admin import NewTopic from kafka.errors import TopicAlreadyExistsError from dotenv import load_dotenv @@ -22,7 +22,7 @@ def get_producer() -> KafkaProducer: global _producer if _producer is None: host = os.getenv('KAFKA_HOST', 'localhost') - port = os.getenv('KAFKA_PORT', '29092') # use internal broker port + port = os.getenv('KAFKA_PORT', '9092') broker = f'{host}:{port}' if port else host print(f"[KAFKA_INIT] Connecting to broker: {broker}") _producer = KafkaProducer( @@ -61,7 +61,7 @@ app.add_middleware( async def startup_event(): """create kafka topics on startup""" host = os.getenv('KAFKA_HOST', 'localhost') - port = os.getenv('KAFKA_PORT', '29092') + port = os.getenv('KAFKA_PORT', '9092') broker = f'{host}:{port}' try: @@ -125,10 +125,62 @@ async def ingest_logs(event: EventPayload): raise HTTPException(status_code=500, detail=str(e)) @app.get("/api/kafka/dump") -def dump_logs(): - # TODO: implement a dump of logs of time period t_start to t_end (params of get) - # OR: allow for params of last_n logs as a param - creating two modes of the dumping - pass +def dump_logs( + last_n: Optional[int] = None, + t_start: Optional[str] = None, + t_end: Optional[str] = None +): + """dump all messages from user-interactions topic + + params: + last_n: return only last n messages (default: all) + t_start: filter by start timestamp iso format (future use) + t_end: filter by end timestamp iso format (future use) + """ + host = os.getenv('KAFKA_HOST', 'localhost') + port = os.getenv('KAFKA_PORT', '9092') + broker = f'{host}:{port}' + + try: + consumer = KafkaConsumer( + 'user-interactions', + bootstrap_servers=[broker], + auto_offset_reset='earliest', + enable_auto_commit=False, + value_deserializer=lambda x: json.loads(x.decode('utf-8')), + consumer_timeout_ms=5000 + ) + + events = [] + for msg in consumer: + events.append(msg.value) + + consumer.close() + + # apply filters + if t_start or t_end: + # filter by timestamp range if provided + filtered = [] + for e in events: + ts = e.get('ts') + if ts: + if t_start and ts < t_start: + continue + if t_end and ts > t_end: + continue + filtered.append(e) + events = filtered + + if last_n and last_n > 0: + events = events[-last_n:] + + return {"success": True, "count": len(events), "data": events} + + except Exception as e: + import traceback + print(f"[DUMP_ERROR] {e}") + print(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) diff --git a/experiments/__init__.py b/experiments/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/agents/__init__.py b/experiments/agents/__init__.py new file mode 100644 index 0000000..041baf6 --- /dev/null +++ b/experiments/agents/__init__.py @@ -0,0 +1 @@ +"""Agentic behavior runner for PHANTOM research platform.""" diff --git a/experiments/agents/agent.py b/experiments/agents/agent.py new file mode 100644 index 0000000..c31e6b2 --- /dev/null +++ b/experiments/agents/agent.py @@ -0,0 +1,44 @@ +from .base import Agent as BaseAgent +from browser_use import Browser, Agent, ChatOpenAI +from enum import Enum + +class AgentTypes(str, Enum): + GENERIC_BROWSER_USE_AGENT = "generic_browser_use_agent" + +def _build_prompt(goal : str, environment_url : str) -> str: + #TODO: Improve prompt engineering here and experiment with + return f"""You are an autonomous agent tasked with achieving the following goal: {goal} +You have access to a web browser to interact with the environment at {environment_url}. +Use the browser to navigate, gather information, and perform actions necessary to accomplish your goal. +Be thorough and ensure you complete the task fully.""" + +class GenericBrowserUseAgent(BaseAgent): + def __init__(self, + goal: str, + url: str = "http://localhost:3000", + timeout: int = 300, + llm_model: str = "gpt-5-mini", + headless: bool = True): + super().__init__(goal, url, timeout) + self.llm_model = ChatOpenAI(model=llm_model) + self.browser = Browser(headless=headless) + self.agent = Agent(task=_build_prompt(goal, url), + llm=self.llm_model, + browser=self.browser) + async def act(self) -> str: + self.result = await self.agent.run() + # https://github.com/browser-use/browser-use/blob/main/browser_use/agent/views.py#L301 + return self.result.final_result() + +def get_agent(agent_type: AgentTypes, **kwargs) -> Agent: + if agent_type == AgentTypes.GENERIC_BROWSER_USE_AGENT: + return GenericBrowserUseAgent(**kwargs) + else: + raise ValueError(f"Unknown agent type: {agent_type}") + +if __name__ == "__main__": + import asyncio + JTBD= "Name all the products on this site and try to find out more about each product by clicking into them (they might not open)" + agent = get_agent(AgentTypes.GENERIC_BROWSER_USE_AGENT, goal=JTBD, url="http://localhost:3000/products", timeout=300) + R=asyncio.run(agent.act()) + print(R) diff --git a/experiments/agents/base.py b/experiments/agents/base.py new file mode 100644 index 0000000..d9800e5 --- /dev/null +++ b/experiments/agents/base.py @@ -0,0 +1,19 @@ +from abc import ABC, abstractmethod +from typing import Optional + +class Agent(ABC): + """Base interface for browser automation agents""" + + def __init__(self, goal: str, url: str = "http://localhost:3000", timeout: int = 300): + self.goal = goal + self.url = url + self.timeout = timeout + self.result: Optional[str] = None + + @abstractmethod + async def act(self) -> str: + """Execute goal and return result text""" + pass + + def final_result(self) -> Optional[str]: + return self.result diff --git a/experiments/agents/test.py b/experiments/agents/test.py new file mode 100644 index 0000000..e7cbeb2 --- /dev/null +++ b/experiments/agents/test.py @@ -0,0 +1,30 @@ +import pytest +import asyncio +from experiments.agents.agent import get_agent, AgentTypes +import os + + +def test_agent_init(): + agent = get_agent(AgentTypes.GENERIC_BROWSER_USE_AGENT, goal="test", url="http://example.com", timeout=100) + assert agent.goal == "test" + assert agent.url == "http://example.com" + assert agent.timeout == 100 + + +def test_invalid_agent(): + with pytest.raises(ValueError): + get_agent("invalid", goal="test") + + +@pytest.mark.asyncio +@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY not set") +async def test_agent_execution(): + agent = get_agent(AgentTypes.GENERIC_BROWSER_USE_AGENT, goal="get page title", url="https://example.com", timeout=60) + + result = await agent.act() + assert result + assert agent.final_result() + assert agent.final_result().history[-1].result[-1].is_done == True + assert isinstance(result, str) + assert "example" in result.lower() + assert len(result) > 0 diff --git a/experiments/data_export.ipynb b/experiments/data_export.ipynb index 4ba73fb..7cd9366 100644 --- a/experiments/data_export.ipynb +++ b/experiments/data_export.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "62eafcd9-5462-4063-8873-0e7fb9add907", "metadata": {}, "outputs": [ @@ -12,7 +12,7 @@ "True" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "4af65cb4-e8cf-4877-b2db-13ac19f3838f", "metadata": {}, "outputs": [ @@ -40,31 +40,25 @@ "output_type": "stream", "text": [ "\n", - "RangeIndex: 528 entries, 0 to 527\n", - "Data columns (total 19 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 sessionId 528 non-null object \n", - " 1 eventType 467 non-null object \n", - " 2 ts 528 non-null object \n", - " 3 targetEl 401 non-null object \n", - " 4 eventName 61 non-null object \n", - " 5 page 61 non-null object \n", - " 6 storeMode 61 non-null object \n", - " 7 userAgent 61 non-null object \n", - " 8 productId 21 non-null object \n", - " 9 metadata_path 467 non-null object \n", - " 10 metadata_referrer 82 non-null object \n", - " 11 metadata_x 425 non-null float64\n", - " 12 metadata_y 425 non-null float64\n", - " 13 metadata_event 7 non-null object \n", - " 14 metadata_targetEl 24 non-null object \n", - " 15 metadata_roomType 5 non-null object \n", - " 16 metadata_price 5 non-null float64\n", - " 17 metadata_nights 5 non-null float64\n", - " 18 metadata_targetUrl 4 non-null object \n", - "dtypes: float64(4), object(15)\n", - "memory usage: 78.5+ KB\n" + "RangeIndex: 73 entries, 0 to 72\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sessionId 73 non-null object \n", + " 1 eventName 73 non-null object \n", + " 2 page 73 non-null object \n", + " 3 productId 67 non-null object \n", + " 4 storeMode 73 non-null object \n", + " 5 userAgent 73 non-null object \n", + " 6 ts 73 non-null object \n", + " 7 metadata_referrer 6 non-null object \n", + " 8 metadata_roomType 45 non-null object \n", + " 9 metadata_price 45 non-null float64\n", + " 10 metadata_nights 45 non-null float64\n", + " 11 metadata_elementText 22 non-null object \n", + " 12 metadata_dwellTime 22 non-null float64\n", + "dtypes: float64(3), object(10)\n", + "memory usage: 7.5+ KB\n" ] } ], @@ -75,7 +69,7 @@ " topic, \n", " enable_auto_commit=True,\n", " value_deserializer=lambda x: json.loads(x.decode('utf-8')),\n", - " auto_offset_reset='earliest',\n", + " auto_offset_reset='earliest', \n", " bootstrap_servers=['localhost:9092'])\n", "messages=consumer.poll(timeout_ms=1000,max_records=10000)\n", "df = []\n", @@ -90,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "f6819a1c-32ab-49c7-845b-5df7bf60f561", "metadata": {}, "outputs": [ @@ -116,39 +110,30 @@ " \n", " \n", " sessionId\n", - " eventType\n", - " ts\n", - " targetEl\n", " eventName\n", " page\n", + " productId\n", " storeMode\n", " userAgent\n", - " productId\n", - " metadata_path\n", + " ts\n", " metadata_referrer\n", - " metadata_x\n", - " metadata_y\n", - " metadata_event\n", - " metadata_targetEl\n", " metadata_roomType\n", " metadata_price\n", " metadata_nights\n", - " metadata_targetUrl\n", + " metadata_elementText\n", + " metadata_dwellTime\n", " \n", " \n", " \n", " \n", " 0\n", - " 1762434923440-66hdhq8qicd\n", - " pageview\n", - " 1762434924107\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", + " d176d7c9-4027-4702-9e31-2a71395cdda0\n", + " page_view\n", + " /products\n", + " None\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...\n", + " 2025-11-14T13:23:46.270Z\n", " \n", " NaN\n", " NaN\n", @@ -158,23 +143,19 @@ " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 1\n", - " 1762434923440-66hdhq8qicd\n", - " click\n", - " 1762434925198\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " f0317a5d-e424-44e9-b784-c8f7291ffe31\n", + " page_view\n", " /\n", - " NaN\n", - " 1098.0\n", - " 663.0\n", - " NaN\n", + " None\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...\n", + " 2025-11-14T13:26:00.291Z\n", + " \n", " NaN\n", " NaN\n", " NaN\n", @@ -183,20 +164,14 @@ " \n", " \n", " 2\n", - " 1762434923440-66hdhq8qicd\n", - " click\n", - " 1762434925371\n", - " MAIN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 1098.0\n", - " 663.0\n", - " NaN\n", + " f0317a5d-e424-44e9-b784-c8f7291ffe31\n", + " page_view\n", + " /products\n", + " None\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...\n", + " 2025-11-14T13:26:07.769Z\n", + " \n", " NaN\n", " NaN\n", " NaN\n", @@ -205,216 +180,49 @@ " \n", " \n", " 3\n", - " 1762434923440-66hdhq8qicd\n", - " pageview\n", - " 1762437192910\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " \n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " f0317a5d-e424-44e9-b784-c8f7291ffe31\n", + " view_item_page\n", + " /products\n", + " htl-0\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...\n", + " 2025-11-14T13:26:15.010Z\n", " NaN\n", + " Premium Room\n", + " 269.0\n", + " 1.0\n", " NaN\n", " NaN\n", " \n", " \n", " 4\n", - " 1762434923440-66hdhq8qicd\n", - " pageview\n", - " 1762437198539\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", + " 238dc588-a7ab-4c0e-bccd-6abca5076c66\n", + " page_view\n", + " /products\n", + " None\n", + " hotel\n", + " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...\n", + " 2025-11-14T13:27:15.457Z\n", " \n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", " \n", " \n", - " 390\n", - " d423ce8a-77aa-4c9a-94d4-d1adddcc3472\n", - " click\n", - " 1762443115648\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 245.0\n", - " 595.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 391\n", - " d423ce8a-77aa-4c9a-94d4-d1adddcc3472\n", - " click\n", - " 1762443174606\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 475.0\n", - " 428.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 392\n", - " d423ce8a-77aa-4c9a-94d4-d1adddcc3472\n", - " click\n", - " 1762443183406\n", - " INPUT\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 832.0\n", - " 219.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 393\n", - " d423ce8a-77aa-4c9a-94d4-d1adddcc3472\n", - " click\n", - " 1762443208588\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 485.0\n", - " 155.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 394\n", - " d423ce8a-77aa-4c9a-94d4-d1adddcc3472\n", - " click\n", - " 1762443225474\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 281.0\n", - " 281.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 407\n", - " 1762444018243-0120z6z5u42f\n", - " pageview\n", - " 1762444018256\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " \n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 408\n", - " 1762444018243-0120z6z5u42f\n", - " click\n", - " 1762445774344\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 299.0\n", - " 214.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 431\n", - " 214d9fad-9b00-40c3-bd0e-7739b6acd654\n", - " pageview\n", - " 1762448190973\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " \n", - " NaN\n", + " 5\n", + " 238dc588-a7ab-4c0e-bccd-6abca5076c66\n", + " view_item_page\n", + " /products\n", + " htl-0\n", + " hotel\n", + " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...\n", + " 2025-11-14T13:27:15.591Z\n", + " NaN\n", + " Premium Room\n", + " 264.0\n", + " 2.0\n", " NaN\n", " NaN\n", " NaN\n", @@ -446,128 +254,96 @@ " NaN\n", " \n", " \n", - " 433\n", - " 214d9fad-9b00-40c3-bd0e-7739b6acd654\n", - " click\n", - " 1762448192645\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 1623.0\n", - " 493.0\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 6\n", + " 238dc588-a7ab-4c0e-bccd-6abca5076c66\n", + " view_item_page\n", + " /products\n", + " htl-0\n", + " hotel\n", + " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...\n", + " 2025-11-14T13:27:21.483Z\n", " NaN\n", + " Premium Room\n", + " 264.0\n", + " 2.0\n", " NaN\n", " NaN\n", " \n", " \n", - " 434\n", - " 214d9fad-9b00-40c3-bd0e-7739b6acd654\n", - " pageview\n", - " 1762448205850\n", + " 7\n", + " 238dc588-a7ab-4c0e-bccd-6abca5076c66\n", + " hover_over_title\n", + " /products\n", + " htl-0\n", + " hotel\n", + " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...\n", + " 2025-11-14T13:27:22.646Z\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " Grand Plaza Hotel\n", + " 1200.0\n", + " \n", + " \n", + " 8\n", + " 238dc588-a7ab-4c0e-bccd-6abca5076c66\n", + " view_item_page\n", + " /products\n", + " htl-0\n", + " hotel\n", + " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...\n", + " 2025-11-14T13:27:25.889Z\n", + " NaN\n", + " Premium Room\n", + " 264.0\n", + " 2.0\n", " NaN\n", " NaN\n", - " /\n", + " \n", + " \n", + " 35\n", + " 013fc334-4045-4d5a-8739-dd0a8766a63b\n", + " page_view\n", + " /products\n", + " None\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...\n", + " 2025-11-14T13:53:59.993Z\n", " \n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " \n", + " \n", + " 36\n", + " 013fc334-4045-4d5a-8739-dd0a8766a63b\n", + " view_item_page\n", + " /products\n", + " htl-0\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...\n", + " 2025-11-14T13:54:10.705Z\n", " NaN\n", + " Premium Room\n", + " 223.0\n", + " 3.0\n", " NaN\n", " NaN\n", " \n", " \n", - " 435\n", - " 214d9fad-9b00-40c3-bd0e-7739b6acd654\n", - " click\n", - " 1762448207922\n", - " DIV\n", + " 37\n", + " 013fc334-4045-4d5a-8739-dd0a8766a63b\n", + " hover_over_title\n", + " /products\n", + " htl-0\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...\n", + " 2025-11-14T13:54:11.771Z\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 421.0\n", - " 216.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 438\n", - " f0d40ca6-c1d3-4ecd-beb3-796adc74349d\n", - " pageview\n", - " 1762448283244\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " \n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 439\n", - " f0d40ca6-c1d3-4ecd-beb3-796adc74349d\n", - " click\n", - " 1762448295524\n", - " HTML\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 614.0\n", - " 720.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 440\n", - " f0d40ca6-c1d3-4ecd-beb3-796adc74349d\n", - " click\n", - " 1762448342763\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", " 416.0\n", " 397.0\n", " NaN\n", @@ -576,178 +352,112 @@ " NaN\n", " NaN\n", " NaN\n", + " Grand Plaza Hotel\n", + " 1200.0\n", " \n", " \n", - " 441\n", - " f0d40ca6-c1d3-4ecd-beb3-796adc74349d\n", - " pageview\n", - " 1762448343396\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " \n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 38\n", + " 013fc334-4045-4d5a-8739-dd0a8766a63b\n", + " view_item_page\n", + " /products\n", + " htl-1\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...\n", + " 2025-11-14T13:54:29.772Z\n", " NaN\n", + " Standard Room\n", + " 267.0\n", + " 5.0\n", " NaN\n", " NaN\n", " \n", " \n", - " 442\n", - " f0d40ca6-c1d3-4ecd-beb3-796adc74349d\n", - " click\n", - " 1762448829631\n", - " DIV\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " /\n", - " NaN\n", - " 45.0\n", - " 44.0\n", - " NaN\n", - " NaN\n", + " 39\n", + " 013fc334-4045-4d5a-8739-dd0a8766a63b\n", + " hover_over_title\n", + " /products\n", + " htl-1\n", + " hotel\n", + " Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...\n", + " 2025-11-14T13:54:30.833Z\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " Seaside Resort\n", + " 1200.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " sessionId eventType ts targetEl \\\n", - "0 1762434923440-66hdhq8qicd pageview 1762434924107 NaN \n", - "1 1762434923440-66hdhq8qicd click 1762434925198 DIV \n", - "2 1762434923440-66hdhq8qicd click 1762434925371 MAIN \n", - "3 1762434923440-66hdhq8qicd pageview 1762437192910 NaN \n", - "4 1762434923440-66hdhq8qicd pageview 1762437198539 NaN \n", - "390 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 click 1762443115648 DIV \n", - "391 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 click 1762443174606 DIV \n", - "392 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 click 1762443183406 INPUT \n", - "393 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 click 1762443208588 DIV \n", - "394 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 click 1762443225474 DIV \n", - "407 1762444018243-0120z6z5u42f pageview 1762444018256 NaN \n", - "408 1762444018243-0120z6z5u42f click 1762445774344 DIV \n", - "431 214d9fad-9b00-40c3-bd0e-7739b6acd654 pageview 1762448190973 NaN \n", - "432 214d9fad-9b00-40c3-bd0e-7739b6acd654 click 1762448192425 DIV \n", - "433 214d9fad-9b00-40c3-bd0e-7739b6acd654 click 1762448192645 DIV \n", - "434 214d9fad-9b00-40c3-bd0e-7739b6acd654 pageview 1762448205850 NaN \n", - "435 214d9fad-9b00-40c3-bd0e-7739b6acd654 click 1762448207922 DIV \n", - "438 f0d40ca6-c1d3-4ecd-beb3-796adc74349d pageview 1762448283244 NaN \n", - "439 f0d40ca6-c1d3-4ecd-beb3-796adc74349d click 1762448295524 HTML \n", - "440 f0d40ca6-c1d3-4ecd-beb3-796adc74349d click 1762448342763 DIV \n", - "441 f0d40ca6-c1d3-4ecd-beb3-796adc74349d pageview 1762448343396 NaN \n", - "442 f0d40ca6-c1d3-4ecd-beb3-796adc74349d click 1762448829631 DIV \n", + " sessionId eventName page \\\n", + "0 d176d7c9-4027-4702-9e31-2a71395cdda0 page_view /products \n", + "1 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view / \n", + "2 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view /products \n", + "3 f0317a5d-e424-44e9-b784-c8f7291ffe31 view_item_page /products \n", + "4 238dc588-a7ab-4c0e-bccd-6abca5076c66 page_view /products \n", + "5 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", + "6 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", + "7 238dc588-a7ab-4c0e-bccd-6abca5076c66 hover_over_title /products \n", + "8 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", + "35 013fc334-4045-4d5a-8739-dd0a8766a63b page_view /products \n", + "36 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n", + "37 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n", + "38 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n", + "39 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n", "\n", - " eventName page storeMode userAgent productId metadata_path \\\n", - "0 NaN NaN NaN NaN NaN / \n", - "1 NaN NaN NaN NaN NaN / \n", - "2 NaN NaN NaN NaN NaN / \n", - "3 NaN NaN NaN NaN NaN / \n", - "4 NaN NaN NaN NaN NaN / \n", - "390 NaN NaN NaN NaN NaN / \n", - "391 NaN NaN NaN NaN NaN / \n", - "392 NaN NaN NaN NaN NaN / \n", - "393 NaN NaN NaN NaN NaN / \n", - "394 NaN NaN NaN NaN NaN / \n", - "407 NaN NaN NaN NaN NaN / \n", - "408 NaN NaN NaN NaN NaN / \n", - "431 NaN NaN NaN NaN NaN / \n", - "432 NaN NaN NaN NaN NaN / \n", - "433 NaN NaN NaN NaN NaN / \n", - "434 NaN NaN NaN NaN NaN / \n", - "435 NaN NaN NaN NaN NaN / \n", - "438 NaN NaN NaN NaN NaN / \n", - "439 NaN NaN NaN NaN NaN / \n", - "440 NaN NaN NaN NaN NaN / \n", - "441 NaN NaN NaN NaN NaN / \n", - "442 NaN NaN NaN NaN NaN / \n", + " productId storeMode userAgent \\\n", + "0 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "1 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", + "2 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", + "3 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", + "4 None hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "5 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "6 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "7 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "8 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "35 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "36 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "37 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "38 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "39 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", "\n", - " metadata_referrer metadata_x metadata_y metadata_event \\\n", - "0 NaN NaN NaN \n", - "1 NaN 1098.0 663.0 NaN \n", - "2 NaN 1098.0 663.0 NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "390 NaN 245.0 595.0 NaN \n", - "391 NaN 475.0 428.0 NaN \n", - "392 NaN 832.0 219.0 NaN \n", - "393 NaN 485.0 155.0 NaN \n", - "394 NaN 281.0 281.0 NaN \n", - "407 NaN NaN NaN \n", - "408 NaN 299.0 214.0 NaN \n", - "431 NaN NaN NaN \n", - "432 NaN 1623.0 493.0 NaN \n", - "433 NaN 1623.0 493.0 NaN \n", - "434 NaN NaN NaN \n", - "435 NaN 421.0 216.0 NaN \n", - "438 NaN NaN NaN \n", - "439 NaN 614.0 720.0 NaN \n", - "440 NaN 416.0 397.0 NaN \n", - "441 NaN NaN NaN \n", - "442 NaN 45.0 44.0 NaN \n", + " ts metadata_referrer metadata_roomType \\\n", + "0 2025-11-14T13:23:46.270Z NaN \n", + "1 2025-11-14T13:26:00.291Z NaN \n", + "2 2025-11-14T13:26:07.769Z NaN \n", + "3 2025-11-14T13:26:15.010Z NaN Premium Room \n", + "4 2025-11-14T13:27:15.457Z NaN \n", + "5 2025-11-14T13:27:15.591Z NaN Premium Room \n", + "6 2025-11-14T13:27:21.483Z NaN Premium Room \n", + "7 2025-11-14T13:27:22.646Z NaN NaN \n", + "8 2025-11-14T13:27:25.889Z NaN Premium Room \n", + "35 2025-11-14T13:53:59.993Z NaN \n", + "36 2025-11-14T13:54:10.705Z NaN Premium Room \n", + "37 2025-11-14T13:54:11.771Z NaN NaN \n", + "38 2025-11-14T13:54:29.772Z NaN Standard Room \n", + "39 2025-11-14T13:54:30.833Z NaN NaN \n", "\n", - " metadata_targetEl metadata_roomType metadata_price metadata_nights \\\n", - "0 NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "390 NaN NaN NaN NaN \n", - "391 NaN NaN NaN NaN \n", - "392 NaN NaN NaN NaN \n", - "393 NaN NaN NaN NaN \n", - "394 NaN NaN NaN NaN \n", - "407 NaN NaN NaN NaN \n", - "408 NaN NaN NaN NaN \n", - "431 NaN NaN NaN NaN \n", - "432 NaN NaN NaN NaN \n", - "433 NaN NaN NaN NaN \n", - "434 NaN NaN NaN NaN \n", - "435 NaN NaN NaN NaN \n", - "438 NaN NaN NaN NaN \n", - "439 NaN NaN NaN NaN \n", - "440 NaN NaN NaN NaN \n", - "441 NaN NaN NaN NaN \n", - "442 NaN NaN NaN NaN \n", - "\n", - " metadata_targetUrl \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - "390 NaN \n", - "391 NaN \n", - "392 NaN \n", - "393 NaN \n", - "394 NaN \n", - "407 NaN \n", - "408 NaN \n", - "431 NaN \n", - "432 NaN \n", - "433 NaN \n", - "434 NaN \n", - "435 NaN \n", - "438 NaN \n", - "439 NaN \n", - "440 NaN \n", - "441 NaN \n", - "442 NaN " + " metadata_price metadata_nights metadata_elementText metadata_dwellTime \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 269.0 1.0 NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "5 264.0 2.0 NaN NaN \n", + "6 264.0 2.0 NaN NaN \n", + "7 NaN NaN Grand Plaza Hotel 1200.0 \n", + "8 264.0 2.0 NaN NaN \n", + "35 NaN NaN NaN NaN \n", + "36 223.0 3.0 NaN NaN \n", + "37 NaN NaN Grand Plaza Hotel 1200.0 \n", + "38 267.0 5.0 NaN NaN \n", + "39 NaN NaN Seaside Resort 1200.0 " ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -758,32 +468,31 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "380eca5f-8304-4fb2-be32-e8bcfd312085", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['214d9fad-9b00-40c3-bd0e-7739b6acd654',\n", - " '1762444018243-0120z6z5u42f',\n", - " 'f0d40ca6-c1d3-4ecd-beb3-796adc74349d',\n", - " 'd423ce8a-77aa-4c9a-94d4-d1adddcc3472',\n", - " '1762434923440-66hdhq8qicd']" + "['013fc334-4045-4d5a-8739-dd0a8766a63b',\n", + " '238dc588-a7ab-4c0e-bccd-6abca5076c66',\n", + " 'd176d7c9-4027-4702-9e31-2a71395cdda0',\n", + " 'f0317a5d-e424-44e9-b784-c8f7291ffe31']" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sessions = list(set(df['sessionId'])); sessions" + "sessions = list(set(df['sessionId'])); sessions # 238dc588-a7ab-4c0e-bccd-6abca5076c66" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "f4ae6f81-dcb8-44be-aee7-30dbc3a6bae1", "metadata": {}, "outputs": [], @@ -793,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "id": "050d90a4-20a9-47f5-b998-c31178a54cb3", "metadata": {}, "outputs": [], @@ -814,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "id": "e68f9004-82f5-4826-aece-e3dc6e15a18f", "metadata": {}, "outputs": [], @@ -876,38 +585,15 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "id": "e255a2c1-6454-4e5e-89f6-ef8ac51ab6cc", "metadata": {}, "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ - "[]\n" + "013fc334-4045-4d5a-8739-dd0a8766a63b\n" ] }, { @@ -919,10 +605,68 @@ "\n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "page_view\n", + "\n", + "page_view\n", + "\n", + "\n", + "\n", + "view_item_page\n", + "\n", + "view_item_page\n", + "\n", + "\n", + "\n", + "page_view->view_item_page\n", + "\n", + "\n", + "1.00\n", + "\n", + "\n", + "\n", + "view_item_page->view_item_page\n", + "\n", + "\n", + "0.68\n", + "\n", + "\n", + "\n", + "hover_over_title\n", + "\n", + "hover_over_title\n", + "\n", + "\n", + "\n", + "view_item_page->hover_over_title\n", + "\n", + "\n", + "0.29\n", + "\n", + "\n", + "\n", + "hover_over_paragraph\n", + "\n", + "hover_over_paragraph\n", + "\n", + "\n", + "\n", + "view_item_page->hover_over_paragraph\n", + "\n", + "\n", + "0.04\n", + "\n", + "\n", + "\n", + "hover_over_title->view_item_page\n", + "\n", + "\n", + "1.00\n", "\n", "\n" ], @@ -957,7 +701,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -967,7 +711,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "[]\n" + "[[0.00000000e+000 1.00000000e+000 0.00000000e+000 0.00000000e+000]\n", + " [0.00000000e+000 6.78571429e-001 2.85714286e-001 3.57142857e-002]\n", + " [0.00000000e+000 1.00000000e+000 0.00000000e+000 0.00000000e+000]\n", + " [2.05833592e-312 2.29175545e-312 4.94065646e-324 6.92110218e-310]]\n", + "238dc588-a7ab-4c0e-bccd-6abca5076c66\n" ] }, { @@ -979,109 +727,185 @@ "\n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "\n", "page_view\n", - "\n", - "page_view\n", + "\n", + "page_view\n", + "\n", + "\n", + "\n", + "view_item_page\n", + "\n", + "view_item_page\n", + "\n", + "\n", + "\n", + "page_view->view_item_page\n", + "\n", + "\n", + "1.00\n", + "\n", + "\n", + "\n", + "view_item_page->view_item_page\n", + "\n", + "\n", + "0.19\n", + "\n", + "\n", + "\n", + "hover_over_title\n", + "\n", + "hover_over_title\n", + "\n", + "\n", + "\n", + "view_item_page->hover_over_title\n", + "\n", + "\n", + "0.38\n", + "\n", + "\n", + "\n", + "hover_over_paragraph\n", + "\n", + "hover_over_paragraph\n", + "\n", + "\n", + "\n", + "view_item_page->hover_over_paragraph\n", + "\n", + "\n", + "0.44\n", + "\n", + "\n", + "\n", + "hover_over_title->view_item_page\n", + "\n", + "\n", + "1.00\n", + "\n", + "\n", + "\n", + "hover_over_paragraph->page_view\n", + "\n", + "\n", + "0.14\n", + "\n", + "\n", + "\n", + "hover_over_paragraph->view_item_page\n", + "\n", + "\n", + "0.86\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0. 1. 0. 0. ]\n", + " [0. 0.1875 0.375 0.4375 ]\n", + " [0. 1. 0. 0. ]\n", + " [0.14285714 0.85714286 0. 0. ]]\n", + "d176d7c9-4027-4702-9e31-2a71395cdda0\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "page_view\n", + "\n", + "page_view\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.]]\n", + "f0317a5d-e424-44e9-b784-c8f7291ffe31\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "page_view\n", + "\n", + "page_view\n", "\n", "\n", "\n", "page_view->page_view\n", - "\n", - "\n", - "0.70\n", + "\n", + "\n", + "0.50\n", "\n", - "\n", + "\n", "\n", - "click\n", - "\n", - "click\n", + "view_item_page\n", + "\n", + "view_item_page\n", "\n", - "\n", + "\n", "\n", - "page_view->click\n", - "\n", - "\n", - "0.17\n", - "\n", - "\n", - "\n", - "product_hover\n", - "\n", - "product_hover\n", - "\n", - "\n", - "\n", - "page_view->product_hover\n", - "\n", - "\n", - "0.13\n", - "\n", - "\n", - "\n", - "click->page_view\n", - "\n", - "\n", - "0.35\n", - "\n", - "\n", - "\n", - "click->click\n", - "\n", - "\n", - "0.41\n", - "\n", - "\n", - "\n", - "click->product_hover\n", - "\n", - "\n", - "0.24\n", - "\n", - "\n", - "\n", - "product_hover->click\n", - "\n", - "\n", - "0.07\n", - "\n", - "\n", - "\n", - "product_hover->product_hover\n", - "\n", - "\n", - "0.60\n", - "\n", - "\n", - "\n", - "product_view\n", - "\n", - "product_view\n", - "\n", - "\n", - "\n", - "product_hover->product_view\n", - "\n", - "\n", - "0.33\n", - "\n", - "\n", - "\n", - "product_view->click\n", - "\n", - "\n", - "1.00\n", + "page_view->view_item_page\n", + "\n", + "\n", + "0.50\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1091,46 +915,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "[[0.69565217 0.17391304 0.13043478 0. ]\n", - " [0.35294118 0.41176471 0.23529412 0. ]\n", - " [0. 0.06666667 0.6 0.33333333]\n", - " [0. 1. 0. 0. ]]\n" - ] - }, - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[]\n" + "[[5.0e-001 5.0e-001]\n", + " [9.9e-324 1.5e-323]]\n" ] } ], "source": [ "def explore_session(session_id: str):\n", " subset = df[df['sessionId'] == session_id]\n", + " print(session_id)\n", " P, labels = build_transition_prob_matrix(subset)\n", " g = render_graph(f\"session_{session_id}\", P, ls_index=labels, threshold=0.01, fmt=\"svg\", view=False)\n", " display(g)\n", @@ -1138,14 +931,6 @@ "for session in sessions:\n", " print(explore_session(session))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d278c2d-406e-4dc0-b219-5f7b236e852b", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/experiments/procesing/extract.py b/experiments/procesing/extract.py new file mode 100644 index 0000000..cfe73e2 --- /dev/null +++ b/experiments/procesing/extract.py @@ -0,0 +1,84 @@ +import pandas as pd +import json +import numpy as np +import os +import requests +from dotenv import load_dotenv +from sklearn.base import BaseEstimator, TransformerMixin +load_dotenv() + +BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:5000") +N_PRICE_BUCKETS = 5 + +def get_data_from_kafka() -> pd.DataFrame: + """fetch all events from backend dump endpoint""" + resp = requests.get(f"{BACKEND_URL}/api/kafka/dump") + resp.raise_for_status() + data = resp.json() + + if not data.get('success') or not data.get('data'): + return pd.DataFrame() + + df = pd.DataFrame(data['data']) + # explode metadata col json + if 'metadata' in df.columns: + df = df.join(pd.json_normalize(df.pop("metadata"), sep=".").add_prefix("metadata_")) + df = df.dropna(subset=['eventName']) + return df + + +def join_with_experiments(df: pd.DataFrame) -> pd.DataFrame: + # TODO: Get experiments db from supabase and join on session_id + return df + + +def augment_event_titles(df: pd.DataFrame) -> pd.DataFrame: + # from taking standard view_item_page in eventName to view_item_page_{metadata_schema} + # we want metadata schema to create product specific event names + + # only create price buckets if we have enough unique prices + if df["metadata_price"].notnull().sum() > 0: + try: + price_buckets = pd.qcut( + df["metadata_price"], + q=N_PRICE_BUCKETS, + labels=[f"PB_{i+1}" for i in range(N_PRICE_BUCKETS)], + duplicates='drop' # handle duplicate bin edges + ) + except ValueError: + # fallback: if still not enough unique values, use cut with fixed ranges or just use raw price + price_buckets = df["metadata_price"].apply(lambda x: f"P_{int(x)}" if pd.notnull(x) else "") + else: + price_buckets = pd.Series([""] * len(df), index=df.index) + + # metadata_schema: _product_id@price_bucket_{i} only if we have product metadata otherswise keep original event name + # TODO: make this adaptive, if we have hover_over_title we append the title, if its view_page we say which page + df["metadata_schema"] = np.where( + df["productId"].notnull() & df["metadata_price"].notnull(), + "_" + df["productId"].astype(str) + "@" + price_buckets.astype(str), + "" + ) + df["eventName"] = df["eventName"] + df["metadata_schema"].astype(str) + return df + + +def extract() -> pd.DataFrame: + df = get_data_from_kafka() + df = join_with_experiments(df) + df = augment_event_titles(df) + return df + + +class DataExtractor(BaseEstimator, TransformerMixin): + def fit(self, X=None, y=None): + return self + + def transform(self, X=None): + return extract() + + +if __name__ == "__main__": + df = extract() + print(df.head()) + print(df.tail()) + print(df.info()) diff --git a/experiments/procesing/mapping.py b/experiments/procesing/mapping.py new file mode 100644 index 0000000..6c32b91 --- /dev/null +++ b/experiments/procesing/mapping.py @@ -0,0 +1,158 @@ +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin + +def build_transition_prob_matrix(df: pd.DataFrame): + df = df.dropna(subset=['eventName']) + events = df['eventName'].tolist() + labels = pd.Index(events).unique().tolist() + idx = {e:i for i,e in enumerate(labels)} + M = np.zeros((len(labels), len(labels)), dtype=float) + for a, b in zip(events, events[1:]): + M[idx[a], idx[b]] += 1 + row_sums = M.sum(axis=1, keepdims=True) + with np.errstate(divide='ignore', invalid='ignore'): + P = np.divide(M, row_sums, where=row_sums>0) # row-normalized + return P, labels + +# https://medium.com/data-science/time-series-data-markov-transition-matrices-7060771e362b +from graphviz import Digraph +import numpy as np +import pandas as pd + +def _as_prob_df(matrix, labels=None): + """Return a square DataFrame with index=columns=labels.""" + if isinstance(matrix, pd.DataFrame): + # Ensure square and aligned + assert (matrix.index == matrix.columns).all(), "Index/columns must match." + return matrix + matrix = np.asarray(matrix, dtype=float) + assert matrix.shape[0] == matrix.shape[1], "Matrix must be square." + if labels is None: + raise ValueError("labels are required when matrix is not a DataFrame") + assert len(labels) == matrix.shape[0], "labels length must match matrix size." + return pd.DataFrame(matrix, index=list(labels), columns=list(labels)) + +def _df_to_edgelist(P: pd.DataFrame, threshold=0.0, round_digits=2): + """Build weighted edges > threshold.""" + edges = [] + for src in P.index: + for dst in P.columns: + w = float(P.loc[src, dst]) + if w > threshold: + edges.append((str(src), str(dst), f"{w:.{round_digits}f}")) + return edges + +def render_graph(fname, matrix, ls_index=None, threshold=0.0, fmt="svg", view=False): + """ + fname: output file stem (no extension) + matrix: NumPy array or pandas DataFrame of transition PROBABILITIES + ls_index: ordered labels (required if matrix is not a DataFrame) + threshold: hide edges with weight <= threshold + fmt: 'svg'|'png'|'pdf' etc. + view: open after rendering + """ + P = _as_prob_df(matrix, labels=ls_index) + edges = _df_to_edgelist(P, threshold=threshold) + + g = Digraph(format=fmt) + g.attr(rankdir="LR", size="30") + g.attr("node", shape="circle") + + # ensure isolated nodes appear + for node in P.index: + g.node(str(node), width="1", height="1") + + for src, dst, label in edges: + g.edge(src, dst, label=label) + + g.render(fname, view=view, cleanup=True) + return g + + +class TransitionProbMatrixTransformer(BaseEstimator, TransformerMixin): + def __init__(self, threshold=0.0): + self.threshold = threshold + self.P_ = None + self.labels_ = None + + def fit(self, X: pd.DataFrame, y=None): + P, labels = build_transition_prob_matrix(X) + self.P_ = P + self.labels_ = labels + return self + + def transform(self, X: pd.DataFrame = None): + return self.P_, self.labels_ + + def render(self, fname: str, fmt="svg", view=False): + if self.P_ is None or self.labels_ is None: + raise ValueError("Transformer has not been fitted yet.") + return render_graph( + fname, + self.P_, + ls_index=self.labels_, + threshold=self.threshold, + fmt=fmt, + view=view + ) + + +class SessionTransitionProbMatrixTransformer(BaseEstimator, TransformerMixin): + def __init__(self, threshold=0.0, session_col='sessionId'): + self.threshold = threshold + self.session_col = session_col + self.session_matrices_ = None + + def fit(self, X: pd.DataFrame, y=None): + if self.session_col not in X.columns: + raise ValueError(f"Column '{self.session_col}' not found in DataFrame") + + session_matrices = {} + for session_id, grp in X.groupby(self.session_col): + if len(grp) > 1: # need at least 2 events for transitions + P, labels = build_transition_prob_matrix(grp) + session_matrices[session_id] = {'matrix': P, 'labels': labels} + + self.session_matrices_ = session_matrices + return self + + def transform(self, X: pd.DataFrame = None): + if self.session_matrices_ is None: + raise ValueError("Transformer has not been fitted yet.") + return pd.Series(self.session_matrices_) + + def render_session(self, session_id: str, fname: str, fmt="svg", view=False): + if self.session_matrices_ is None: + raise ValueError("Transformer has not been fitted yet.") + if session_id not in self.session_matrices_: + raise ValueError(f"Session '{session_id}' not found in fitted data.") + + sess_data = self.session_matrices_[session_id] + return render_graph( + fname, + sess_data['matrix'], + ls_index=sess_data['labels'], + threshold=self.threshold, + fmt=fmt, + view=view + ) +if __name__ == "__main__": + # Example usage + data = { + 'eventName': [ + 'A', 'B', 'A', 'C', 'B', 'A', 'A', 'C', 'B', 'C', + 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A' + ] + } + df = pd.DataFrame(data) + + transformer = TransitionProbMatrixTransformer(threshold=0.1) + transformer.fit(df) + P, labels = transformer.transform(None) + + print("Transition Probability Matrix:") + print(pd.DataFrame(P, index=labels, columns=labels)) + + # Render the graph + transformer.render("transition_graph", fmt="svg", view=False) diff --git a/experiments/procesing/pipeline.py b/experiments/procesing/pipeline.py new file mode 100644 index 0000000..6b742b2 --- /dev/null +++ b/experiments/procesing/pipeline.py @@ -0,0 +1,19 @@ +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from extract import DataExtractor +from mapping import SessionTransitionProbMatrixTransformer, render_graph + + +if __name__ == "__main__": + steps = [ + ('data_extraction', DataExtractor()), + ('transition_matrix', SessionTransitionProbMatrixTransformer(threshold=0.05)), + ] + pipeline = Pipeline(steps) + result = pipeline.fit_transform(None) + print(f"Number of sessions: {len(result)}\n") + + for session_id, sess_data in result.items(): + fname = f"session_{session_id}" + render_graph(fname, sess_data['matrix'], ls_index=sess_data['labels'], threshold=0.05, fmt="svg", view=False) + print(f"Rendered {fname}.svg") diff --git a/paper/concat_code.sh b/paper/concat_code.sh index 2503458..abbd676 100755 --- a/paper/concat_code.sh +++ b/paper/concat_code.sh @@ -16,10 +16,11 @@ mkdir -p "$(dirname "$OUTPUT_FILE")" add_file() { local filepath="$1" local relpath="${filepath#$PROJECT_ROOT/}" + local escaped_path="${relpath//_/\\_}" # Add section header and code listing (no language-specific highlighting) - echo "\\subsection{${relpath}}" >> "$OUTPUT_FILE" - echo "\\begin{lstlisting}[caption={${relpath}}]" >> "$OUTPUT_FILE" + echo "\\subsection{${escaped_path}}" >> "$OUTPUT_FILE" + echo "\\begin{lstlisting}[caption={${escaped_path}}]" >> "$OUTPUT_FILE" cat "$filepath" >> "$OUTPUT_FILE" echo "" >> "$OUTPUT_FILE" echo "\\end{lstlisting}" >> "$OUTPUT_FILE" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..2122ae5 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +testpaths = experiments +python_files = test*.py +python_classes = Test* +python_functions = test_* +asyncio_mode = auto +asyncio_default_fixture_loop_scope = function diff --git a/requirements.txt b/requirements.txt index 99cb58e..8bb3ed7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,8 @@ jupyter ipykernel matplotlib graphviz +browser-use +pytest +pytest-asyncio +uv +scikit-learn