# OokamiPupV2/modules/utility.py

import time
import os
import random
import json
import re

try:
    # 'regex' on PyPI supports Unicode property classes such as `\p{L}` and
    # `\p{N}`, which the GENERAL whitelist in sanitize_user_input() uses.
    import regex
    USE_REGEX_LIB = True
except ImportError:
    # Fallback to Python's built-in 're' if 'regex' isn't installed.
    # In that case the name `regex` is never bound, so code must check
    # USE_REGEX_LIB before touching it.
    USE_REGEX_LIB = False

# Directory holding the dictionary JSON files read by get_random_reply().
# This is a relative path, so it is resolved against the process's current
# working directory at the time a file is opened.
DICTIONARY_PATH = "dictionary/"

def format_uptime(seconds: float) -> tuple[str, int]:
    """
    Convert a duration in seconds into a short human-readable string.

    Only the two most significant non-zero units are shown. Example outputs:
      "32 minutes"
      "8 days, 4 hours"
      "1 year, 3 months"

    A year is counted as 365 days and a month as 30 days.

    :param seconds: Duration in seconds. Fractions are truncated and negative
        values are treated as 0.
    :return: A tuple of (human-readable string, total whole seconds).
        A zero-length duration yields ("0 seconds", 0).
    """
    # Keep the total in its own variable: the loop below consumes a separate
    # running remainder, and the caller expects the untouched total back.
    total_seconds = max(0, int(seconds))

    # Units from largest to smallest, with their length in seconds.
    units = (
        ("year", 31536000),   # 365 days
        ("month", 2592000),   # 30 days
        ("day", 86400),       # 24 hours
        ("hour", 3600),       # 60 minutes
        ("minute", 60),
        ("second", 1),
    )

    parts = []
    remaining = total_seconds
    for unit_name, unit_seconds in units:
        value, remaining = divmod(remaining, unit_seconds)
        if value:
            # Pluralize everything except exactly 1.
            parts.append(f"{value} {unit_name}{'s' if value != 1 else ''}")
            if len(parts) == 2:
                # Only the two most significant units are ever displayed.
                break

    if not parts:
        return ("0 seconds", 0)
    return (", ".join(parts), total_seconds)

class _KeepMissingPlaceholders(dict):
    """Mapping for ``str.format_map`` that leaves unknown placeholders as-is."""

    def __missing__(self, key):
        # Re-emit the placeholder text instead of raising KeyError.
        return "{" + key + "}"


def get_random_reply(dictionary_name: str, category: str, **variables) -> str:
    """
    Fetches a random string from a given dictionary and category.
    Supports variable substitution using keyword arguments.

    Problems are reported by returning a bracketed "[Error: ...]" string
    rather than by raising, so the result can always be sent as a reply.

    :param dictionary_name: The name of the dictionary file (without .json)
    :param category: The category (key) inside the dictionary to fetch a response from
    :param variables: Keyword arguments to replace placeholders in the string.
        Placeholders without a matching keyword are left untouched.
    :return: A formatted string with the variables replaced, the unformatted
        entry if it cannot be formatted at all, or an "[Error: ...]" string
        when the file or category is missing, unreadable, or empty.
    """
    file_path = os.path.join(DICTIONARY_PATH, f"{dictionary_name}.json")

    # Ensure file exists
    if not os.path.exists(file_path):
        return f"[Error: Missing {dictionary_name}.json]"

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except (OSError, ValueError):
        # OSError: unreadable, or removed after the exists() check.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        return f"[Error: Failed to load {dictionary_name}.json]"

    # Ensure the category exists and holds a non-empty list; random.choice
    # raises IndexError on an empty sequence.
    entries = data.get(category) if isinstance(data, dict) else None
    if not isinstance(entries, list) or not entries:
        return f"[Error: No valid entries for {category} in {dictionary_name}.json]"

    # Select a random reply; str() guards against non-string JSON entries.
    template = str(random.choice(entries))

    # Replace placeholders with provided variables. A malformed template
    # (stray braces, positional fields, bad format spec) is returned verbatim
    # instead of raising out of a reply helper.
    try:
        return template.format_map(_KeepMissingPlaceholders(variables))
    except (AttributeError, LookupError, TypeError, ValueError):
        return template

##############################
# Basic sanitization
# DO NOT RELY SOLELY ON THIS
##############################
def sanitize_user_input(
    user_input: str,
    usage: str = "GENERAL",
    max_length: int = 500
) -> tuple[str, bool, str, str]:
    """
    A whitelisting-based function for sanitizing user input.

    Returns a tuple of:
      (sanitized_str, sanitization_applied_bool, sanitization_reason, original_str)

    The flag and reason report truncation and removal of disallowed
    characters. Replacing newlines/tabs with a space and trimming surrounding
    whitespace are treated as normalization and are not reported.

    :param user_input: The raw string from the user (e.g., from Twitch or Discord).
        It is converted with str() before processing.
    :param usage:
        - 'CALC': Keep digits, math operators, parentheses, etc.
        - 'GENERAL' (or any other value): Keep typical readable characters & punctuation.
        Matching is case-insensitive.
    :param max_length: Truncate the input if it exceeds this length.
        Negative values are treated as 0; None disables truncation.
    :return: (sanitized_str, bool, reason_string, original_str), where
        reason_string joins multiple reasons with "; ".

    ======================
    SECURITY RECOMMENDATIONS
    ======================
    1) For database storage (MariaDB, etc.):
       - **Always** use parameterized queries or an ORM with bound parameters.
       - Do not rely solely on string sanitization to prevent SQL injection.

    2) For code execution (e.g., 'eval'):
       - Avoid using eval/exec on user input.
       - If you must, consider a restricted math parser or an audited sandbox.

    3) For HTML sanitization:
       - Bleach is deprecated; research modern alternatives or frameworks that
         safely sanitize HTML output. This function does *not* sanitize HTML tags.
    """
    original_string = str(user_input)
    reasons = []

    # 1. Truncate. A negative limit would make the slice drop characters from
    #    the END of the string instead of capping its length, so clamp it.
    limit = None if max_length is None else max(0, max_length)
    truncated = original_string[:limit]
    if len(truncated) < len(original_string):
        # Truncation discards user data, so it must be visible to the caller.
        reasons.append(f"Truncated input to {limit} characters.")

    # Collapse each run of CR/LF/tab into a single space.
    sanitized = re.sub(r"[\r\n\t]+", " ", truncated)

    # 2. Choose how to filter based on usage
    if usage.upper() == "CALC":
        # Allow digits, +, -, *, /, %, parentheses, decimal points,
        # ^ for exponent, and spaces. Remove everything else.
        filtered = re.sub(r"[^0-9+\-*/%().^ \t]", "", sanitized)
        removal_reason = "CALC: Removed non-math characters."
    else:  # GENERAL usage
        # Remove ASCII control chars (0-31, 127) first in both variants.
        without_controls = re.sub(r"[\x00-\x1F\x7F]", "", sanitized)
        if USE_REGEX_LIB:
            # Broad whitelist: \p{L} letters, \p{N} numbers, \p{P} punctuation,
            # \p{S} symbols, \p{Z} separators (including spaces).
            # This keeps emojis, foreign characters, typical punctuation, etc.
            filtered = regex.sub(
                r"[^\p{L}\p{N}\p{P}\p{S}\p{Z}]", "", without_controls
            )
            removal_reason = "GENERAL: Removed disallowed chars via regex."
        else:
            # Fallback without 'regex': keep only printable ASCII (32-126).
            filtered = re.sub(r"[^ -~]", "", without_controls)
            removal_reason = "GENERAL: Removed non-ASCII or control chars (fallback)."

    if filtered != sanitized:
        reasons.append(removal_reason)

    # 3. Final trim
    sanitized = filtered.strip()

    # 4. Prepare output; the flag is set exactly when a reason was recorded.
    return (sanitized, bool(reasons), "; ".join(reasons), original_string)