Merge branch 'agent-behavior-loader-developemen' into feat-strong-learning-implementation-with-data-contamination

2026-07-16 01:53:37 +00:00 · 2026-01-31 10:08:59 +01:00
parent 26abff5864 72877439ca
commit 2f481bd94b
25 changed files with 1205 additions and 117 deletions
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -18,8 +18,6 @@ try:
 except ImportError:
    lib_make_state_repr = None
    lib_transition_histogram = None
-    print("lib no includable")
-


 class BehaviorModel:
@@ -226,7 +224,7 @@ if __name__ == "__main__":

    agent_model = AgentBehaviorModel(agent_dir)
    agent_mdp = agent_model.build_MDP()
-    print(agent_mdp)
+
    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
          f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
    if not agent_mdp['states']:
@@ -235,8 +233,6 @@ if __name__ == "__main__":

    human_evt = aggregate_event_transitions(human_mdp)
    agent_evt = aggregate_event_transitions(agent_mdp)
-    print(agent_evt)
-

    common = set(human_evt.keys()) & set(agent_evt.keys())

--- a/sim/rl/engine.py
+++ b/sim/rl/engine.py
@@ -1,10 +1,10 @@
+from os import kill
 import numpy as np
 import pandas as pd
 from abc import ABC, abstractmethod
 from typing import Dict, Any
 from sim.rl.environment import BusinessLogicConstraints

-
 """
 An engine by default should have its own demand estimation mechanism based on the observations it receives, which are the computed features.
 From these features we then follow the research structure of q -> p, using a mechanism that is testable and must be updatable.
@@ -39,6 +39,7 @@ class BasePricingEngine(ABC):



+
    def reset(self):
        """reset engine state for new episode"""
        self.step_count = 0
@@ -68,15 +69,16 @@ class WildPricingEngine(BasePricingEngine):

    def reset(self):
        super().reset()
-        self.e_hat = np.full((self.c.product_catalogue_size,), -1.3, dtype=np.float32)
-        self.mu_logp = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
-        self.mu_logq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
-        self.cov_pq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
-        self.var_p = np.ones(self.c.product_catalogue_size, dtype=np.float32)
+        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
+        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32)

    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
        self.step_count += 1
-        demand = _extract_demand(observation, self.c.product_catalogue_size)
+        # extract demand signal (from env observation) as proxy for sales
+        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
        return self._update_from_demand(current_prices, demand)

    def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray: