{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "d3b830ff-94b7-4596-b85a-320060180a6b", "metadata": {}, "outputs": [], "source": [ "from datetime import datetime, timedelta\n", "import pandas as pd\n", "import numpy as np\n", "from tms_data_interface import SQLQueryInterface\n", "\n", "def apply_sar_flag(df, var1, var2, var3, random_state=42):\n", " \"\"\"\n", " Apply percentile-based thresholds, split data into alerting and non-alerting,\n", " flag random 10% of alerting data as 'Y', and merge back.\n", "\n", " Parameters:\n", " df (pd.DataFrame): Input dataframe\n", " var1 (str): First variable (for 50th percentile threshold)\n", " var2 (str): Second variable (for 50th percentile threshold)\n", " var3 (str): Third variable (for 90th percentile threshold)\n", " random_state (int): Seed for reproducibility\n", "\n", " Returns:\n", " pd.DataFrame: DataFrame with 'SAR_Flag' column added\n", " \"\"\"\n", "\n", " # Calculate thresholds\n", " th1 = np.percentile(df[var1].dropna(), 90)\n", " th2 = np.percentile(df[var2].dropna(), 90)\n", " th3 = np.percentile(df[var3].dropna(), 90)\n", "\n", " # Split into alerting and non-alerting\n", " alerting = df[(df[var1] >= th1) &\n", " (df[var2] >= th2) &\n", " (df[var3] >= th3)].copy()\n", "\n", " non_alerting = df.loc[~df.index.isin(alerting.index)].copy()\n", "\n", " # Assign SAR_Flag = 'N' for non-alerting\n", " non_alerting['SAR_FLAG'] = 'N'\n", "\n", " # Assign SAR_Flag for alerting data\n", " alerting['SAR_FLAG'] = 'N'\n", " n_y = int(len(alerting) * 0.1) # 10% count\n", " if n_y > 0:\n", " y_indices = alerting.sample(n=n_y, random_state=random_state).index\n", " alerting.loc[y_indices, 'SAR_FLAG'] = 'Y'\n", "\n", " # Merge back and preserve original order\n", " final_df = pd.concat([alerting, non_alerting]).sort_index()\n", "\n", " return final_df\n", "\n", "query = \"\"\"\n", "WITH time_windows AS (\n", " SELECT\n", " -- End time is the current trade time\n", " date_time AS end_time,\n", "\n", " -- Subtract seconds from the end_time using date_add() with negative integer interval\n", " date_add('second', -{time_window_s}, date_time) AS start_time,\n", "\n", " -- Trade details\n", " trade_price,\n", " trade_volume,\n", " trader_id,\n", "\n", " -- Calculate minimum price within the time window\n", " MIN(trade_price) OVER (\n", " ORDER BY date_time \n", " RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW\n", " ) AS min_price,\n", "\n", " -- Calculate maximum price within the time window\n", " MAX(trade_price) OVER (\n", " ORDER BY date_time \n", " RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW\n", " ) AS max_price,\n", "\n", " -- Calculate total trade volume within the time window\n", " SUM(trade_volume) OVER ( \n", " ORDER BY date_time \n", " RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW\n", " ) AS total_volume,\n", "\n", " -- Calculate participant's trade volume within the time window\n", " SUM(CASE WHEN trader_id = trader_id THEN trade_volume ELSE 0 END) OVER (\n", " PARTITION BY trader_id \n", " ORDER BY date_time \n", " RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW\n", " ) AS participant_volume\n", " FROM\n", " {trade_data_1b}\n", ")\n", "SELECT\n", " -- Select the time window details\n", " start_time,\n", " end_time,\n", "\n", " -- Select the participant (trader) ID\n", " trader_id AS \"Participant\",\n", "\n", " -- Select the calculated min and max prices\n", " min_price,\n", " max_price,\n", "\n", " -- Calculate the price change percentage\n", " (max_price - min_price) / NULLIF(min_price, 0) * 100 AS \"Price Change (%)\",\n", "\n", " -- Calculate the participant's volume as a percentage of total volume\n", " (participant_volume / NULLIF(total_volume, 0)) * 100 AS \"Volume (%)\",\n", "\n", " -- Participant volume\n", " participant_volume,\n", "\n", " -- Select the total volume within the window\n", " total_volume AS \"Total Volume\"\n", "FROM\n", " time_windows\n", "\"\"\"\n", "\n", "\n", "from tms_data_interface import SQLQueryInterface\n", "\n", "class Scenario:\n", " seq = SQLQueryInterface(schema=\"trade_schema\")\n", " def logic(self, **kwargs):\n", " validation_window = kwargs.get('validation_window', 300000)\n", " time_window_s = int(validation_window/1000)\n", " query_start_time = datetime.now()\n", " print(\"Query start time :\",query_start_time)\n", " row_list = self.seq.execute_raw(query.format(trade_data_1b=\"trade_10m_v3\",\n", " time_window_s = time_window_s)\n", " )\n", " cols = [\n", " 'START_DATE_TIME',\n", " 'END_DATE_TIME',\n", " 'Focal_id',\n", " 'MIN_PRICE',\n", " 'MAX_PRICE',\n", " 'PRICE_CHANGE_PCT',\n", " 'PARTICIPANT_VOLUME_PCT',\n", " 'PARTICIPANT_VOLUME',\n", " 'TOTAL_VOLUME',\n", " ]\n", " final_scenario_df = pd.DataFrame(row_list, columns = cols)\n", " final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n", " final_scenario_df['TOTAL_VOLUME'] * 100\n", " final_scenario_df['Segment'] = 'Default'\n", " # final_scenario_df['SAR_FLAG'] = 'N'\n", " final_scenario_df['Risk'] = 'Medium Risk'\n", " final_scenario_df.dropna(inplace=True)\n", " final_scenario_df = apply_sar_flag(final_scenario_df,\n", " 'PRICE_CHANGE_PCT',\n", " 'PARTICIPANT_VOLUME_PCT',\n", " 'TOTAL_VOLUME',\n", " random_state=42)\n", " # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n", " return final_scenario_df\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 5 }