diff --git a/.ipynb_checkpoints/main-checkpoint.ipynb b/.ipynb_checkpoints/main-checkpoint.ipynb
index 7697184..52390c0 100644
--- a/.ipynb_checkpoints/main-checkpoint.ipynb
+++ b/.ipynb_checkpoints/main-checkpoint.ipynb
@@ -11,6 +11,49 @@
     "import pandas as pd\n",
     "from tms_data_interface import SQLQueryInterface\n",
     "\n",
+    "def apply_sar_flag(df, var1, var2, var3, random_state=42):\n",
+    "    \"\"\"\n",
+    "    Apply percentile-based thresholds, split data into alerting and non-alerting,\n",
+    "    flag random 10% of alerting data as 'Y', and merge back.\n",
+    "\n",
+    "    Parameters:\n",
+    "        df (pd.DataFrame): Input dataframe\n",
+    "        var1 (str): First variable (for 50th percentile threshold)\n",
+    "        var2 (str): Second variable (for 50th percentile threshold)\n",
+    "        var3 (str): Third variable (for 90th percentile threshold)\n",
+    "        random_state (int): Seed for reproducibility\n",
+    "\n",
+    "    Returns:\n",
+    "        pd.DataFrame: DataFrame with 'SAR_Flag' column added\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Calculate thresholds\n",
+    "    th1 = np.percentile(df[var1].dropna(), 50)\n",
+    "    th2 = np.percentile(df[var2].dropna(), 50)\n",
+    "    th3 = np.percentile(df[var3].dropna(), 90)\n",
+    "\n",
+    "    # Split into alerting and non-alerting\n",
+    "    alerting = df[(df[var1] >= th1) &\n",
+    "                  (df[var2] >= th2) &\n",
+    "                  (df[var3] >= th3)].copy()\n",
+    "\n",
+    "    non_alerting = df.loc[~df.index.isin(alerting.index)].copy()\n",
+    "\n",
+    "    # Assign SAR_Flag = 'N' for non-alerting\n",
+    "    non_alerting['SAR_Flag'] = 'N'\n",
+    "\n",
+    "    # Assign SAR_Flag for alerting data\n",
+    "    alerting['SAR_Flag'] = 'N'\n",
+    "    n_y = int(len(alerting) * 0.1)  # 10% count\n",
+    "    if n_y > 0:\n",
+    "        y_indices = alerting.sample(n=n_y, random_state=random_state).index\n",
+    "        alerting.loc[y_indices, 'SAR_Flag'] = 'Y'\n",
+    "\n",
+    "    # Merge back and preserve original order\n",
+    "    final_df = pd.concat([alerting, non_alerting]).sort_index()\n",
+    "\n",
+    "    return final_df\n",
+    "\n",
     "query = \"\"\"\n",
     "WITH time_windows AS (\n",
     "    SELECT\n",
@@ -107,9 +150,14 @@
     "        final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n",
     "            final_scenario_df['TOTAL_VOLUME'] * 100\n",
     "        final_scenario_df['Segment'] = 'Default'\n",
-    "        final_scenario_df['SAR_FLAG'] = 'N'\n",
+    "        # final_scenario_df['SAR_FLAG'] = 'N'\n",
     "        final_scenario_df['Risk'] = 'Medium Risk'\n",
     "        final_scenario_df.dropna(inplace=True)\n",
+    "        final_scenario_df = apply_sar_flag(final_scenario_df,\n",
+    "                                           'PRICE_CHANGE_PCT',\n",
+    "                                           'PARTICIPANT_VOLUME_PCT',\n",
+    "                                           'TOTAL_VOLUME',\n",
+    "                                           random_state=42)\n",
     "        # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n",
     "        return final_scenario_df\n"
    ]
diff --git a/main.ipynb b/main.ipynb
index 7697184..52390c0 100644
--- a/main.ipynb
+++ b/main.ipynb
@@ -11,6 +11,49 @@
     "import pandas as pd\n",
     "from tms_data_interface import SQLQueryInterface\n",
     "\n",
+    "def apply_sar_flag(df, var1, var2, var3, random_state=42):\n",
+    "    \"\"\"\n",
+    "    Apply percentile-based thresholds, split data into alerting and non-alerting,\n",
+    "    flag random 10% of alerting data as 'Y', and merge back.\n",
+    "\n",
+    "    Parameters:\n",
+    "        df (pd.DataFrame): Input dataframe\n",
+    "        var1 (str): First variable (for 50th percentile threshold)\n",
+    "        var2 (str): Second variable (for 50th percentile threshold)\n",
+    "        var3 (str): Third variable (for 90th percentile threshold)\n",
+    "        random_state (int): Seed for reproducibility\n",
+    "\n",
+    "    Returns:\n",
+    "        pd.DataFrame: DataFrame with 'SAR_Flag' column added\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Calculate thresholds\n",
+    "    th1 = np.percentile(df[var1].dropna(), 50)\n",
+    "    th2 = np.percentile(df[var2].dropna(), 50)\n",
+    "    th3 = np.percentile(df[var3].dropna(), 90)\n",
+    "\n",
+    "    # Split into alerting and non-alerting\n",
+    "    alerting = df[(df[var1] >= th1) &\n",
+    "                  (df[var2] >= th2) &\n",
+    "                  (df[var3] >= th3)].copy()\n",
+    "\n",
+    "    non_alerting = df.loc[~df.index.isin(alerting.index)].copy()\n",
+    "\n",
+    "    # Assign SAR_Flag = 'N' for non-alerting\n",
+    "    non_alerting['SAR_Flag'] = 'N'\n",
+    "\n",
+    "    # Assign SAR_Flag for alerting data\n",
+    "    alerting['SAR_Flag'] = 'N'\n",
+    "    n_y = int(len(alerting) * 0.1)  # 10% count\n",
+    "    if n_y > 0:\n",
+    "        y_indices = alerting.sample(n=n_y, random_state=random_state).index\n",
+    "        alerting.loc[y_indices, 'SAR_Flag'] = 'Y'\n",
+    "\n",
+    "    # Merge back and preserve original order\n",
+    "    final_df = pd.concat([alerting, non_alerting]).sort_index()\n",
+    "\n",
+    "    return final_df\n",
+    "\n",
     "query = \"\"\"\n",
     "WITH time_windows AS (\n",
     "    SELECT\n",
@@ -107,9 +150,14 @@
     "        final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n",
     "            final_scenario_df['TOTAL_VOLUME'] * 100\n",
     "        final_scenario_df['Segment'] = 'Default'\n",
-    "        final_scenario_df['SAR_FLAG'] = 'N'\n",
+    "        # final_scenario_df['SAR_FLAG'] = 'N'\n",
     "        final_scenario_df['Risk'] = 'Medium Risk'\n",
     "        final_scenario_df.dropna(inplace=True)\n",
+    "        final_scenario_df = apply_sar_flag(final_scenario_df,\n",
+    "                                           'PRICE_CHANGE_PCT',\n",
+    "                                           'PARTICIPANT_VOLUME_PCT',\n",
+    "                                           'TOTAL_VOLUME',\n",
+    "                                           random_state=42)\n",
     "        # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n",
     "        return final_scenario_df\n"
    ]
diff --git a/main.py b/main.py
index 7bd671e..af5c963 100644
--- a/main.py
+++ b/main.py
@@ -8,6 +8,49 @@ from datetime import datetime, timedelta
 import pandas as pd
 from tms_data_interface import SQLQueryInterface
 
+def apply_sar_flag(df, var1, var2, var3, random_state=42):
+    """
+    Apply percentile-based thresholds, split data into alerting and non-alerting,
+    flag random 10% of alerting data as 'Y', and merge back.
+
+    Parameters:
+        df (pd.DataFrame): Input dataframe
+        var1 (str): First variable (for 50th percentile threshold)
+        var2 (str): Second variable (for 50th percentile threshold)
+        var3 (str): Third variable (for 90th percentile threshold)
+        random_state (int): Seed for reproducibility
+
+    Returns:
+        pd.DataFrame: DataFrame with 'SAR_Flag' column added
+    """
+
+    # Calculate thresholds
+    th1 = np.percentile(df[var1].dropna(), 50)
+    th2 = np.percentile(df[var2].dropna(), 50)
+    th3 = np.percentile(df[var3].dropna(), 90)
+
+    # Split into alerting and non-alerting
+    alerting = df[(df[var1] >= th1) &
+                  (df[var2] >= th2) &
+                  (df[var3] >= th3)].copy()
+
+    non_alerting = df.loc[~df.index.isin(alerting.index)].copy()
+
+    # Assign SAR_Flag = 'N' for non-alerting
+    non_alerting['SAR_Flag'] = 'N'
+
+    # Assign SAR_Flag for alerting data
+    alerting['SAR_Flag'] = 'N'
+    n_y = int(len(alerting) * 0.1)  # 10% count
+    if n_y > 0:
+        y_indices = alerting.sample(n=n_y, random_state=random_state).index
+        alerting.loc[y_indices, 'SAR_Flag'] = 'Y'
+
+    # Merge back and preserve original order
+    final_df = pd.concat([alerting, non_alerting]).sort_index()
+
+    return final_df
+
 query = """
 WITH time_windows AS (
     SELECT
@@ -104,9 +147,14 @@ class Scenario:
         final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\
             final_scenario_df['TOTAL_VOLUME'] * 100
         final_scenario_df['Segment'] = 'Default'
-        final_scenario_df['SAR_FLAG'] = 'N'
+        # final_scenario_df['SAR_FLAG'] = 'N'
         final_scenario_df['Risk'] = 'Medium Risk'
         final_scenario_df.dropna(inplace=True)
+        final_scenario_df = apply_sar_flag(final_scenario_df,
+                                           'PRICE_CHANGE_PCT',
+                                           'PARTICIPANT_VOLUME_PCT',
+                                           'TOTAL_VOLUME',
+                                           random_state=42)
         # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']
         return final_scenario_df
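
For reference, the sketch below shows how the new apply_sar_flag helper behaves end to end. It is an illustration only: the DataFrame and its values are synthetic, apply_sar_flag is assumed to be in scope (imported from main.py or pasted from the hunks above), and numpy is assumed to be importable as np, which the helper needs for np.percentile even though the visible hunks do not show that import.

import numpy as np
import pandas as pd

# apply_sar_flag as added in main.py above is assumed to be in scope,
# e.g. via `from main import apply_sar_flag`.

# Synthetic stand-in for final_scenario_df (illustrative values only).
rng = np.random.default_rng(0)
demo = pd.DataFrame({
    'PRICE_CHANGE_PCT': rng.uniform(0, 20, 500),          # alerts need >= 50th percentile
    'PARTICIPANT_VOLUME_PCT': rng.uniform(0, 100, 500),   # alerts need >= 50th percentile
    'TOTAL_VOLUME': rng.integers(1_000, 1_000_000, 500),  # alerts need >= 90th percentile
})

flagged = apply_sar_flag(demo,
                         'PRICE_CHANGE_PCT',
                         'PARTICIPANT_VOLUME_PCT',
                         'TOTAL_VOLUME',
                         random_state=42)

# Every input row comes back (concat + sort_index restores the original
# index order), and only ~10% of the rows that clear all three thresholds
# carry SAR_Flag == 'Y'; everything else is 'N'.
assert len(flagged) == len(demo)
print(flagged['SAR_Flag'].value_counts())

With random_state fixed at 42, the same rows are flagged on every run, which is what the call site in the diff relies on for reproducibility.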
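
If numpy turns out not to be imported in main.py, the same cut-offs can be computed with pandas alone. This is an alternative sketch, not part of the diff, and percentile_thresholds is a hypothetical helper name: Series.quantile takes fractions rather than percentages, and its default linear interpolation matches np.percentile's default, so the resulting thresholds are identical.

import pandas as pd

def percentile_thresholds(df: pd.DataFrame, var1: str, var2: str, var3: str):
    # Same cut-offs as in apply_sar_flag, without a numpy dependency:
    # the 50th and 90th percentiles expressed as fractions for Series.quantile.
    th1 = df[var1].dropna().quantile(0.5)
    th2 = df[var2].dropna().quantile(0.5)
    th3 = df[var3].dropna().quantile(0.9)
    return th1, th2, th3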