mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 08:33:36 +00:00
feat: wip contaminator
This commit is contained in:
44
experiments/procesing/contaminator.py
Normal file
44
experiments/procesing/contaminator.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import os
import random

import pandas as pd

from sim.rl.behavior_loader import AgentBehaviorModel
|
||||||
|
|
||||||
|
# Root of the experiments tree. The default is the original hard-coded
# checkout location; set PHANTOM_EXPERIMENTS_DIR to point the module at a
# checkout on another machine without editing the source.
base_dir = os.environ.get(
    "PHANTOM_EXPERIMENTS_DIR",
    "/home/velocitatem/Documents/Projects/PHANTOM/experiments",
)
# Data directories derived from the root (trailing slash kept, as before).
human_dir = f"{base_dir}/collected_data/"
agent_dir = f"{base_dir}/agents/collected_data/"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.DataFrame:
    """Return a copy of ``df`` with the values of column ``on`` renamed.

    Values that appear as keys in ``mapping`` are replaced by the mapped
    value; every other value is kept exactly as it is. The lookup is done
    per element instead of ``map(mapping).fillna(...)`` so that unmapped
    values never pass through NaN: this keeps integer columns from being
    upcast to float and keeps a key that is explicitly mapped to ``None``
    from being silently reverted to its original value.

    Args:
        df: Input frame; it is not modified.
        mapping: Old value -> new value for column ``on``.
        on: Name of the column to remap.

    Returns:
        A new DataFrame with column ``on`` remapped.

    Raises:
        KeyError: If ``on`` is not a column of ``df``.
    """
    def _translate(value):
        # Fall back to the original value when there is no mapping for it.
        return mapping.get(value, value)

    remapped = df.copy()
    remapped[on] = df[on].map(_translate)
    return remapped
|
||||||
|
|
||||||
|
|
||||||
|
def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
                        contamination_rate: float = 0.1) -> pd.DataFrame:
    """Append trajectories sampled from ``AgentBehaviorModel`` to ``df``.

    With ``N = len(df)``, the number of trajectories sampled is
    ``int(N / (1 - contamination_rate) - N)``. One start event per
    trajectory is drawn at random, weighted by the empirical distribution
    of column ``on`` in ``df``.

    Args:
        df: Original dataset; it is not modified.
        on: Column holding the event type used for the start distribution.
        contamination_rate: Target contaminated share, in ``[0, 1)``.

    Returns:
        A new DataFrame with the original rows followed by the rows built
        from the sampled trajectories, with a fresh ``RangeIndex``. When
        nothing is to be added (rate 0, empty ``df``, or a count that
        truncates to 0) the original rows are returned with a reset index
        and the behaviour model is not loaded.

    Raises:
        ValueError: If ``contamination_rate`` is not in ``[0, 1)``.
        KeyError: If trajectories are to be sampled and ``on`` is not a
            column of ``df``.
    """
    if not 0 <= contamination_rate < 1:
        # A rate of 1 would divide by zero below; other values outside the
        # interval would silently produce no contamination at all.
        raise ValueError(
            f"contamination_rate must be in [0, 1), got {contamination_rate!r}"
        )

    n_rows = len(df)
    # c / (n_rows + c) = rate  =>  n_rows + c = n_rows / (1 - rate)
    n_final = n_rows / (1 - contamination_rate)  # TODO: explain this in paper
    n_contaminate = int(n_final - n_rows)
    # NOTE(review): n_contaminate counts trajectories while n_rows counts
    # rows; if a trajectory yields more than one row, the realised share of
    # added rows will exceed contamination_rate - confirm the intended unit.
    if n_contaminate <= 0:
        # Nothing to add; also avoids random.choices failing on an empty
        # population when df has no rows.
        return df.reset_index(drop=True)

    # TODO: define the dataset -> agent-model event mapping properly for the
    # given dataset (e.g. 'view' -> 'view_page') and apply it with
    # remap_schema before/after sampling.

    # value_counts(normalize=True) already returns weights that sum to 1.
    event_weights = df[on].value_counts(normalize=True)
    start_event_types = random.choices(
        event_weights.index.tolist(),
        weights=event_weights.tolist(),
        k=n_contaminate,
    )

    model = AgentBehaviorModel(agent_dir)
    new_trajectories = []
    for _start_event in start_event_types:
        # TODO: define start according to the dataset. The start event sampled
        # above is not passed to the model yet; None is used instead.
        start = None
        trajectory = model.sample_trajectory(start)  # TODO: explain this method in paper
        new_trajectories.extend(trajectory)

    # TODO: make sure the new trajectories' schema conforms with the dataset
    contaminate_df = pd.DataFrame(new_trajectories)
    return pd.concat([df, contaminate_df], ignore_index=True)
|
||||||
Reference in New Issue
Block a user