From c5caee21b168b9ca05b0fe3990f9b9c8de5c6f33 Mon Sep 17 00:00:00 2001 From: Daniel Rosel Date: Sun, 14 Dec 2025 17:59:34 +0100 Subject: [PATCH] formulating the reward simply --- sim/rl/environment.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/sim/rl/environment.py b/sim/rl/environment.py index a09438f..ca7159b 100644 --- a/sim/rl/environment.py +++ b/sim/rl/environment.py @@ -40,7 +40,7 @@ class CommercePlatform: 'mean_sale_price': df[df['action'] == 'purchase']['price'].mean(), } - def run_pricing_simulation(self, prices: np.ndarray) -> np.ndarray: + def run_pricing_simulation(self, prices: np.ndarray) -> dict: # Simulate demand based on prices observed_demand, demand_from_agents = self.setup_true_demand(prices) @@ -51,16 +51,17 @@ class CommercePlatform: demand_estimates = self.demand_estimate(interaction_data) internal_error = np.abs(true_demand - demand_estimates) / (true_demand + 1e-6) - self.simulation_history.append( - { + + summary = { 'prices': prices, 'true_demand': true_demand, 'demand_estimates': demand_estimates, 'internal_error': internal_error, 'interaction_data': interaction_data, 'interaction_features': interaction_features - }) - return np.array(interaction_data) + } + self.simulation_history.append(summary) + return summary def get_interaction_data(self) -> np.ndarray: # Simulate interaction data @@ -118,10 +119,24 @@ class PHANTOMEnv(gym.Env): self.constraints.system_min_price, self.constraints.system_max_price) + result = self.commerce_platform.run_pricing_simulation(self.state['price']) + history = self.commerce_platform.simulation_history + self.state['demand'] = result['demand_estimates'] + + + + reward = sum( + self.state['price'] * self.state['demand'], + # performance historically, to take into account business kpi trends (using features from interaction data) + sum( + [-0.05 * i * history[-1]['internal_error'] for i in range(1, len(history))], + ) if len(history) > 1 else 0, + sum( + 
[0.1 * history[-1]['interaction_features']['mean_sale_price'] - 0.1 * history[i]['interaction_features']['mean_sale_price'] for i in range(len(history)-1)], + ) if len(history) > 1 else 0 + ) - # Calculate reward (e.g., revenue) - reward = new_price * demand # Check if episode is done done = self.state['price'] <= 0.0 or self.state['demand'] <= 0.0