# Source code for nabqr.nabqr

from .functions import *
from .helper_functions import simulate_correlated_ar1_process, set_n_closest_to_zero
import matplotlib.pyplot as plt
import scienceplots

# Import-time side effect: switch matplotlib to the "no-latex" style.
# NOTE(review): this style name is presumably registered by the `scienceplots`
# import above, which is otherwise unused in this module — confirm before
# removing that import.
plt.style.use(["no-latex"])
from .visualization import visualize_results
import datetime as dt



# [docs]
# Simulation back-ends understood by ``run_nabqr_pipeline``.
_VALID_SIMULATION_TYPES = ("ar1", "sde")


def _simulate_inputs(
    n_samples,
    phi,
    sigma,
    offset_start,
    offset_end,
    offset_step,
    correlation,
    simulation_type,
):
    """Simulate an ensemble matrix and matching actuals; return ``(X, actuals)``.

    ``simulation_type`` must already have been validated to be "ar1" or "sde".
    """
    if simulation_type == "ar1":
        # The offsets and the equicorrelation matrix are only needed by the
        # AR(1) simulator, so they are built here rather than unconditionally.
        offset = np.arange(offset_start, offset_end, offset_step)
        m = len(offset)
        corr_matrix = correlation * np.ones((m, m)) + (1 - correlation) * np.eye(m)
        X, actuals = simulate_correlated_ar1_process(
            n_samples, phi, sigma, m, corr_matrix, offset, smooth=5
        )
        return X, actuals

    # simulation_type == "sde"
    initial_params = {
        'X0': 0.6,
        'theta': 0.77,
        'kappa': 0.12,         # Slower mean reversion
        'sigma_base': 1.05,    # Lower base volatility
        'alpha': 0.57,         # Lower ARCH effect
        'beta': 1.2,           # High persistence
        'lambda_jump': 0.045,  # Fewer jumps
        # NOTE(review): the original comment said "Negative jumps", but the
        # value is 0.0 — confirm the intended jump mean.
        'jump_mu': 0.0,
        'jump_sigma': 0.1,     # Moderate jump size variation
    }

    # Clamp every starting value into the simulator's admissible range and
    # tell the user when a value had to be moved. Only existing keys are
    # reassigned, so mutating the dict while iterating is safe.
    bounds = get_parameter_bounds()
    for param, value in initial_params.items():
        lower_bound, upper_bound = bounds[param]
        if not (lower_bound <= value <= upper_bound):
            print(f"Initial parameter {param}={value} is out of bounds ({lower_bound}, {upper_bound})")
            initial_params[param] = lower_bound if value < lower_bound else upper_bound

    # The simulator also returns its time grid, which is not needed here.
    _, actuals, X = simulate_wind_power_sde(initial_params, T=n_samples, dt=1.0)
    return X, actuals


def _plot_simulated_data(X, actuals):
    """Show the simulated ensembles in shades of blue with actuals in black."""
    plt.figure(figsize=(10, 6))
    cmap = plt.cm.Blues
    num_series = X.shape[1] if X.ndim > 1 else 1
    colors = [cmap(i) for i in np.linspace(0.3, 1, num_series)]  # Shades of blue
    if num_series > 1:
        for i in range(num_series):
            plt.plot(X[:, i], color=colors[i], alpha=0.7)
    else:
        plt.plot(X, color=colors[0], alpha=0.7)
    plt.plot(actuals, color="black", linewidth=2, label="Actuals")
    plt.title("Simulated Data")
    plt.xlabel("Time")
    plt.ylabel("Value")
    plt.legend()
    plt.show()


def run_nabqr_pipeline(
    n_samples=2000,
    phi=0.995,
    sigma=8,
    offset_start=10,
    offset_end=500,
    offset_step=15,
    correlation=0.8,
    data_source="NABQR-TEST",
    training_size=0.7,
    epochs=20,
    timesteps=[0, 1, 2, 6, 12, 24],
    quantiles=[0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
    X=None,
    actuals=None,
    simulation_type="sde",
    visualize=True,
    taqr_limit=5000,
    save_files=True,
):
    """
    Run the complete NABQR pipeline, which may include data simulation, model training,
    and visualization. The user can either provide pre-computed inputs (X, actuals)
    or opt to simulate data if both are not provided.

    Parameters
    ----------
    n_samples : int, optional
        Number of time steps to simulate if no data provided, by default 2000.
    phi : float, optional
        AR(1) coefficient for simulation, by default 0.995.
    sigma : float, optional
        Standard deviation of noise for simulation, by default 8.
    offset_start : int, optional
        Start value for offset range, by default 10.
    offset_end : int, optional
        End value for offset range, by default 500.
    offset_step : int, optional
        Step size for offset range, by default 15.
    correlation : float, optional
        Base correlation between dimensions, by default 0.8.
    data_source : str, optional
        Identifier for the data source, by default "NABQR-TEST".
    training_size : float, optional
        Proportion of data to use for training, by default 0.7.
    epochs : int, optional
        Number of epochs for model training, by default 20.
    timesteps : list, optional
        List of timesteps to use for LSTM, by default [0, 1, 2, 6, 12, 24].
    quantiles : list, optional
        List of quantiles to predict, by default [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99].
    X : array-like, optional
        Pre-computed input features. If not provided along with `actuals`, the function
        will prompt to simulate data.
    actuals : array-like, optional
        Pre-computed actual target values. If not provided along with `X`, the function
        will prompt to simulate data.
    simulation_type : str, optional
        Type of simulation to use, either "ar1" or "sde", by default "sde".
        Only consulted when data is simulated. ``phi``, ``sigma``, the
        ``offset_*`` arguments and ``correlation`` are used by "ar1" only;
        "sde" uses a fixed built-in parameter set.
    visualize : bool, optional
        Determines if any visual elements will be plotted to the screen or saved as figures.
        This includes the preview plot of simulated data.
    taqr_limit : int, optional
        The lookback limit for the TAQR model, by default 5000.
    save_files : bool, optional
        Determines if any files will be saved, by default True. Note: the R-file needs to save some .csv files to run properly.

    Returns
    -------
    tuple
        A tuple containing:

        - corrected_ensembles: pd.DataFrame
            The corrected ensemble predictions.
        - taqr_results: list of numpy.ndarray
            The TAQR results.
        - actuals_output: list of numpy.ndarray
            The actual output values.
        - BETA_output: list of numpy.ndarray
            The BETA parameters.
        - scores: pd.DataFrame
            The scores for the predictions and original/corrected ensembles.

    Raises
    ------
    ValueError
        If exactly one of X and actuals is provided, if `simulation_type` is
        not "ar1" or "sde" when data has to be simulated, or if the user opts
        not to simulate data when both X and actuals are missing.
    """
    x_missing = X is None
    actuals_missing = actuals is None

    # Exactly one of the two inputs is an error: they only make sense together.
    if x_missing != actuals_missing:
        raise ValueError("Either provide both X and actuals, or none at all.")

    if x_missing:
        # Fail fast on a bad simulation type instead of first blocking on the
        # interactive prompt below.
        if simulation_type not in _VALID_SIMULATION_TYPES:
            raise ValueError("Invalid simulation type. Please choose 'ar1' or 'sde'.")

        choice = (
            input(
                "X and actuals are not provided. Do you want to simulate data? (y/n): "
            )
            .strip()
            .lower()
        )
        if choice != "y":
            raise ValueError(
                "Data was not provided and simulation not approved. Terminating function."
            )

        X, actuals = _simulate_inputs(
            n_samples,
            phi,
            sigma,
            offset_start,
            offset_end,
            offset_step,
            correlation,
            simulation_type,
        )

        # Honour `visualize`: with it switched off no figure may pop up
        # (plt.show() would otherwise block non-interactive runs).
        if visualize:
            _plot_simulated_data(X, actuals)

    # Run the pipeline
    corrected_ensembles, taqr_results, actuals_output, BETA_output, X_ensembles = pipeline(
        X,
        actuals,
        data_source,
        training_size=training_size,
        epochs=epochs,
        timesteps_for_lstm=timesteps,
        quantiles_taqr=quantiles,
        limit=taqr_limit,
        save_files=save_files,
    )

    # Visualize results
    if visualize:
        visualize_results(actuals_output, taqr_results, f"{data_source} example")

    # Calculate scores
    scores = calculate_scores(
        actuals_output,
        taqr_results,
        X_ensembles,
        corrected_ensembles,
        quantiles,
        data_source,
        plot_reliability=True,
        visualize=visualize,
    )

    return corrected_ensembles, taqr_results, actuals_output, BETA_output, scores



# Script entry point: run the pipeline with every default. Because no X/actuals
# are passed, this asks on stdin for permission to simulate data first.
if __name__ == "__main__":
    run_nabqr_pipeline()