# BoostAI/Mock-Data/generate.py

"""
Mock pupil-history dataset generator for the BoostAI "Learning Path Agent"
hackathon challenge.

Outputs JSON files under the same directory, mirroring the production
SQLAlchemy schema in elevenplus-backend/src/app/models/. The generator is
deterministic (seeded RNG) so re-runs produce identical output.

Run:
    python3 generate.py

Outputs:
    classroom.json, students.json, question_bank.json, assignments.json,
    assignment_questions.json, assignment_assignees.json,
    student_answers.json, activity_logs.json, dataset.json
"""
from __future__ import annotations

import json
import os
import random
from datetime import datetime, timedelta, timezone
from pathlib import Path

# Fixed "now" for every relative timestamp in the dataset (09:00 UTC on
# 1 May 2026). Keep in sync with the hackathon brief.
TODAY = datetime(year=2026, month=5, day=1, hour=9, tzinfo=timezone.utc)

# One seeded generator for the module; the fixed seed is what makes repeated
# runs reproducible.
RNG = random.Random(20260501)

# Directory that contains this script.
OUT_DIR = Path(__file__).parent


def ms(dt: datetime) -> int:
    """Convert *dt* to a Unix timestamp in whole milliseconds.

    Any fractional millisecond is dropped by the ``int`` conversion.
    """
    epoch_seconds = dt.timestamp()
    return int(epoch_seconds * 1000)


def days_ago(n: float, hour: int = 10, minute: int = 0) -> datetime:
    """Return the datetime *n* days before TODAY, pinned to ``hour:minute``.

    The date is shifted first; the time of day is then overwritten with the
    given hour and minute, and seconds and microseconds are zeroed. A
    negative *n* yields a date after TODAY.
    """
    shifted = TODAY - timedelta(days=n)
    return shifted.replace(hour=hour, minute=minute, second=0, microsecond=0)


# ---------------------------------------------------------------------------
# Classroom + Tutor + Students
# ---------------------------------------------------------------------------

# Tutor account row. Other fixtures in this file refer to it through its id.
TUTOR = dict(
    id=100,
    fullname="Sarah Johnson",
    email="sarah.johnson@boostai.example",
    username="sjohnson",
    role="tutor",
    active=True,
    is_test=False,
    is_deleted=False,
    created_at=ms(days_ago(180)),
    updated_at=ms(days_ago(180)),
)

# The one classroom in the dataset, taught by TUTOR.
CLASSROOM = dict(
    id=500,
    name="Year 6 — Maths Set 1",
    organization_id=1,
    tutor_id=TUTOR["id"],
    invite_code="Y6MATHS1",
    target_level=6,
    archived=False,
    hide_just_answer=False,
    is_deleted=False,
    created_at=ms(days_ago(60)),
    updated_at=ms(days_ago(60)),
)

# Twelve students as (id, full name, persona) tuples. The persona is a
# hackathon-only annotation (absent from the production schema): it drives
# answer generation further down and documents the expected agent output.
# It is stored on each student row as "_persona"; strip underscore-prefixed
# fields before seeding the real DB.
STUDENTS_RAW = [
    (201, "Aisha Khan", "fraction_inversion"),
    (202, "Ben Carter", "place_value_gaps"),
    (203, "Chen Wei", "rushed_careless"),
    (204, "Daniela Rossi", "solve_together_dependent"),
    (205, "Elif Demir", "word_problem_weak"),
    (206, "Felix Brown", "stable_strong"),
    (207, "Grace Park", "stable_strong"),
    (208, "Harry Singh", "stable_mid"),
    (209, "Isla Nakamura", "stable_mid"),
    (210, "Jaden Williams", "stable_mid"),
    (211, "Kira Patel", "stable_weak"),
    (212, "Liam O'Connor", "stable_weak"),
]

# Expand every raw tuple into a student row plus the membership row that
# links that student to CLASSROOM.
STUDENTS = list()
CLASSROOM_STUDENT_RS = list()
for sid, fullname, persona in STUDENTS_RAW:
    # Lower-cased first name with apostrophes removed; combined with the id
    # it forms the email local part and the username.
    first = fullname.split()[0].lower().replace("'", "")
    STUDENTS.append(dict(
        id=sid,
        fullname=fullname,
        email=f"{first}.{sid}@boostai.example",
        username=f"{first}{sid}",
        role="student",
        active=True,
        is_test=False,
        is_deleted=False,
        created_at=ms(days_ago(55)),
        updated_at=ms(days_ago(55)),
        _persona=persona,
    ))
    CLASSROOM_STUDENT_RS.append(dict(
        id=600 + sid,
        classroom_id=CLASSROOM["id"],
        student_id=sid,
        created_at=ms(days_ago(55)),
    ))


# ---------------------------------------------------------------------------
# Question Bank — Maths, 11+
#   Fields mirror question_bank.py.
#   _wrong_answers: hackathon helper — maps a misconception persona to the
#   wrong answer that persona typically produces. Not in production schema.
# ---------------------------------------------------------------------------

# One dict per question. Ids are grouped by section in blocks of 100
# (10xx Place Value, 11xx Arithmetic, 12xx Negative Numbers, 13xx BIDMAS,
# 14xx Fractions, 15xx Algebra, 16xx Geometry, 17xx Data, 18xx word problems).
# "_wrong_answers" is optional here: entries that omit it receive an empty
# mapping from the default-filling loop that follows this list.
QUESTION_BANK = [
    # ---- Place Value (tens, hundreds, thousands, decimals) ----
    {"id": 1001, "topic": "Place Value", "sub_topic": "Multi-digit numbers", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is the value of the digit 7 in the number 4,732?",
     "correct_answer": "700",
     "_wrong_answers": {"place_value_gaps": "70"}},
    {"id": 1002, "topic": "Place Value", "sub_topic": "Multi-digit numbers", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Round 24,587 to the nearest thousand.",
     "correct_answer": "25000",
     "_wrong_answers": {"place_value_gaps": "24000"}},
    {"id": 1003, "topic": "Place Value", "sub_topic": "Decimals", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Write 0.07 as a fraction in its simplest form.",
     "correct_answer": "7/100",
     "_wrong_answers": {"place_value_gaps": "7/10"}},
    {"id": 1004, "topic": "Place Value", "sub_topic": "Decimals", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is 0.3 + 0.07?",
     "correct_answer": "0.37",
     "_wrong_answers": {"place_value_gaps": "0.10"}},

    # ---- Arithmetic ----
    {"id": 1101, "topic": "Arithmetic", "sub_topic": "Addition", "tag": None,
     "difficulty": "EASY",
     "question_text": "Calculate 246 + 137.",
     "correct_answer": "383",
     "_wrong_answers": {"rushed_careless": "373"}},
    {"id": 1102, "topic": "Arithmetic", "sub_topic": "Subtraction", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Calculate 503 - 47.",
     "correct_answer": "456",
     "_wrong_answers": {"place_value_gaps": "544", "rushed_careless": "466"}},
    {"id": 1103, "topic": "Arithmetic", "sub_topic": "Subtraction", "tag": None,
     "difficulty": "HARD",
     "question_text": "Calculate 4,002 - 1,375.",
     "correct_answer": "2627",
     "_wrong_answers": {"place_value_gaps": "3737", "rushed_careless": "2617"}},
    {"id": 1104, "topic": "Arithmetic", "sub_topic": "Multiplication", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Calculate 28 x 7.",
     "correct_answer": "196",
     "_wrong_answers": {"rushed_careless": "186"}},
    {"id": 1105, "topic": "Arithmetic", "sub_topic": "Division", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Calculate 144 / 6.",
     "correct_answer": "24",
     "_wrong_answers": {"rushed_careless": "26"}},
    {"id": 1106, "topic": "Arithmetic", "sub_topic": "Multiplication", "tag": None,
     "difficulty": "HARD",
     "question_text": "Calculate 156 x 24.",
     "correct_answer": "3744",
     "_wrong_answers": {"rushed_careless": "3724", "place_value_gaps": "374"}},

    # ---- Negative Numbers ----
    {"id": 1201, "topic": "Negative Numbers", "sub_topic": "Addition", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is -5 + 8?",
     "correct_answer": "3"},
    {"id": 1202, "topic": "Negative Numbers", "sub_topic": "Subtraction", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 4 - 9?",
     "correct_answer": "-5"},
    {"id": 1203, "topic": "Negative Numbers", "sub_topic": "Mixed", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is -7 - (-3)?",
     "correct_answer": "-4"},

    # ---- BIDMAS ----
    {"id": 1301, "topic": "BIDMAS", "sub_topic": None, "tag": None,
     "difficulty": "EASY",
     "question_text": "Calculate 5 + 3 x 2.",
     "correct_answer": "11"},
    {"id": 1302, "topic": "BIDMAS", "sub_topic": None, "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Calculate (8 - 3) x 4 + 2.",
     "correct_answer": "22"},
    {"id": 1303, "topic": "BIDMAS", "sub_topic": None, "tag": None,
     "difficulty": "HARD",
     "question_text": "Calculate 24 / (2 + 4) + 3 x 5.",
     "correct_answer": "19",
     "_wrong_answers": {"rushed_careless": "17"}},

    # ---- Fractions: Equivalent ----
    {"id": 1401, "topic": "Fractions", "sub_topic": "Equivalent", "tag": None,
     "difficulty": "EASY",
     "question_text": "Which fraction is equivalent to 2/4?",
     "correct_answer": "1/2"},
    {"id": 1402, "topic": "Fractions", "sub_topic": "Equivalent", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Simplify 12/18 to its lowest terms.",
     "correct_answer": "2/3"},
    {"id": 1403, "topic": "Fractions", "sub_topic": "Equivalent", "tag": None,
     "difficulty": "HARD",
     "question_text": "Which is larger: 3/5 or 5/8? Give your answer.",
     "correct_answer": "5/8"},

    # ---- Fractions: Add/Subtract  (CORE for fraction_inversion persona) ----
    {"id": 1411, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is 1/2 + 1/3?",
     "correct_answer": "5/6",
     "_wrong_answers": {"fraction_inversion": "2/5"}},
    {"id": 1412, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is 1/4 + 1/2?",
     "correct_answer": "3/4",
     "_wrong_answers": {"fraction_inversion": "2/6"}},
    {"id": 1413, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 2/5 + 1/3?",
     "correct_answer": "11/15",
     "_wrong_answers": {"fraction_inversion": "3/8"}},
    {"id": 1414, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 3/4 + 1/6?",
     "correct_answer": "11/12",
     "_wrong_answers": {"fraction_inversion": "4/10"}},
    {"id": 1415, "topic": "Fractions", "sub_topic": "Subtract", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 5/6 - 1/3?",
     "correct_answer": "1/2",
     "_wrong_answers": {"fraction_inversion": "4/3"}},
    {"id": 1416, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is 7/12 + 5/8?",
     "correct_answer": "29/24",
     "_wrong_answers": {"fraction_inversion": "12/20"}},

    # ---- Fractions: Multiply ----
    {"id": 1421, "topic": "Fractions", "sub_topic": "Multiply", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is 1/2 x 1/3?",
     "correct_answer": "1/6",
     "_wrong_answers": {"fraction_inversion": "2/3"}},
    {"id": 1422, "topic": "Fractions", "sub_topic": "Multiply", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 2/3 x 3/4?",
     "correct_answer": "1/2",
     "_wrong_answers": {"fraction_inversion": "5/7"}},
    {"id": 1423, "topic": "Fractions", "sub_topic": "Multiply", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is 4/5 of 35?",
     "correct_answer": "28",
     "_wrong_answers": {"fraction_inversion": "20"}},

    # ---- Algebra: Simple Equations ----
    {"id": 1501, "topic": "Algebra", "sub_topic": "Simple Equations", "tag": None,
     "difficulty": "EASY",
     "question_text": "Solve x + 7 = 12.",
     "correct_answer": "5"},
    {"id": 1502, "topic": "Algebra", "sub_topic": "Simple Equations", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Solve 3x - 4 = 17.",
     "correct_answer": "7",
     "_wrong_answers": {"rushed_careless": "8"}},
    {"id": 1503, "topic": "Algebra", "sub_topic": "Simple Equations", "tag": None,
     "difficulty": "HARD",
     "question_text": "Solve 2(x + 3) = 18.",
     "correct_answer": "6"},

    # ---- Algebra: Sequences ----
    {"id": 1511, "topic": "Algebra", "sub_topic": "Sequences", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is the next term: 2, 5, 8, 11, ___ ?",
     "correct_answer": "14"},
    {"id": 1512, "topic": "Algebra", "sub_topic": "Sequences", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Find the 10th term of the sequence 4, 7, 10, 13, ...",
     "correct_answer": "31"},
    {"id": 1513, "topic": "Algebra", "sub_topic": "Sequences", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is the nth term of the sequence 5, 8, 11, 14, ...?",
     "correct_answer": "3n+2"},

    # ---- Geometry: Area & Perimeter ----
    {"id": 1601, "topic": "Geometry", "sub_topic": "Perimeter", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is the perimeter of a square with side length 7 cm?",
     "correct_answer": "28"},
    {"id": 1602, "topic": "Geometry", "sub_topic": "Area", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is the area of a rectangle 8 cm by 5 cm?",
     "correct_answer": "40"},
    {"id": 1603, "topic": "Geometry", "sub_topic": "Area", "tag": None,
     "difficulty": "HARD",
     "question_text": "A right-angled triangle has base 6 cm and height 9 cm. What is its area?",
     "correct_answer": "27"},

    # ---- Geometry: Angles ----
    {"id": 1611, "topic": "Geometry", "sub_topic": "Angles", "tag": None,
     "difficulty": "EASY",
     "question_text": "Two angles on a straight line are 110° and x. What is x?",
     "correct_answer": "70"},
    {"id": 1612, "topic": "Geometry", "sub_topic": "Angles", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "The angles of a triangle are 45°, 60° and x. What is x?",
     "correct_answer": "75"},
    {"id": 1613, "topic": "Geometry", "sub_topic": "Angles", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is the sum of interior angles of a hexagon?",
     "correct_answer": "720"},

    # ---- Data: Mean / Median / Mode ----
    {"id": 1701, "topic": "Data", "sub_topic": "Mean", "tag": None,
     "difficulty": "EASY",
     "question_text": "Find the mean of 4, 6, 8, 10, 12.",
     "correct_answer": "8"},
    {"id": 1702, "topic": "Data", "sub_topic": "Median", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Find the median of 3, 7, 2, 9, 5.",
     "correct_answer": "5"},
    {"id": 1703, "topic": "Data", "sub_topic": "Mode", "tag": None,
     "difficulty": "EASY",
     "question_text": "Find the mode of 2, 3, 3, 5, 7, 7, 7, 8.",
     "correct_answer": "7"},

    # ---- Data: Probability ----
    {"id": 1711, "topic": "Data", "sub_topic": "Probability", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "A bag has 3 red and 5 blue marbles. What is the probability of red?",
     "correct_answer": "3/8"},
    {"id": 1712, "topic": "Data", "sub_topic": "Probability", "tag": None,
     "difficulty": "HARD",
     "question_text": "A fair die is rolled. What is the probability of an even number greater than 2?",
     "correct_answer": "1/3"},

    # ---- Word problems (cross-topic) ----
    {"id": 1801, "topic": "Arithmetic", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "EASY",
     "question_text": "Tom has 24 apples. He gives 9 to his friend. How many does he have left?",
     "correct_answer": "15"},
    {"id": 1802, "topic": "Fractions", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "MEDIUM",
     "question_text": "A pizza is cut into 8 slices. Sara eats 1/4 and Tom eats 3/8. What fraction is left?",
     "correct_answer": "3/8",
     "_wrong_answers": {"word_problem_weak": "1/2", "fraction_inversion": "4/12"}},
    {"id": 1803, "topic": "Arithmetic", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "MEDIUM",
     "question_text": "A train ticket costs £8.50. How much do 6 tickets cost?",
     "correct_answer": "51",
     "_wrong_answers": {"word_problem_weak": "48", "rushed_careless": "50"}},
    {"id": 1804, "topic": "Algebra", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "HARD",
     "question_text": "Three consecutive numbers add up to 72. What is the smallest number?",
     "correct_answer": "23",
     "_wrong_answers": {"word_problem_weak": "24"}},
    {"id": 1805, "topic": "Geometry", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "HARD",
     "question_text": "A rectangular garden is 12 m long and 4 m shorter than it is long. What is its area?",
     "correct_answer": "96",
     "_wrong_answers": {"word_problem_weak": "48"}},
]

# Backfill the standard columns on every bank entry. setdefault leaves any
# value an entry already declares untouched, and the order below is the
# order in which missing keys are appended to each entry.
for q in QUESTION_BANK:
    for column, fallback in (
        ("category", "Math"),
        ("year_level", "Year 6"),
        ("source", "BOOST"),
        ("source_description", None),
        ("teacher_id", TUTOR["id"]),
        ("maximum_marks", 1),
        ("rubric", None),
        ("step_by_step_solution", None),
        ("image_url", None),
        ("is_deleted", False),
        ("created_at", ms(days_ago(40))),
        ("updated_at", ms(days_ago(40))),
    ):
        q.setdefault(column, fallback)
    # A fresh dict per question, so entries never share one mapping.
    q.setdefault("_wrong_answers", {})


# ---------------------------------------------------------------------------
# Assignments + Assignment Questions
#
# 8 assignments. Distribution: 5 CLOSED (past), 2 PUBLISHED (in-flight),
# 1 DRAFT (future). The deadline-pressure assignment is a Fractions/Add
# assignment due in 5 days — drives the bonus Early Warning topic correlation.
# ---------------------------------------------------------------------------

ASSIGNMENT_DEFS = [
    # Tuple layout:
    #   (id, name, focus_topic, due_offset_days, status, question_bank_ids)
    # due_offset_days is relative to TODAY; negative values lie in the past.
    (
        3001,
        "HW1 — Place Value Warmup",
        "Place Value",
        -28,
        "CLOSED",
        [1001, 1002, 1003, 1101, 1102, 1301, 1401, 1701],
    ),
    (
        3002,
        "HW2 — Arithmetic Practice",
        "Arithmetic",
        -22,
        "CLOSED",
        [1101, 1102, 1103, 1104, 1105, 1106, 1801, 1803],
    ),
    (
        3003,
        "HW3 — Fractions Foundations",
        "Fractions",
        -16,
        "CLOSED",
        [1401, 1402, 1411, 1412, 1413, 1421, 1422, 1802],
    ),
    (
        3004,
        "HW4 — Negatives & BIDMAS",
        "BIDMAS",
        -10,
        "CLOSED",
        [1201, 1202, 1203, 1301, 1302, 1303, 1502, 1701],
    ),
    (
        3005,
        "HW5 — Geometry Basics",
        "Geometry",
        -6,
        "CLOSED",
        [1601, 1602, 1603, 1611, 1612, 1613, 1804, 1805],
    ),
    (
        3006,
        "HW6 — Algebra & Sequences",
        "Algebra",
        2,
        "PUBLISHED",
        [1501, 1502, 1503, 1511, 1512, 1513, 1804, 1702],
    ),
    # The deadline-pressure assignment — bonus Early Warning anchors here.
    (
        3007,
        "HW7 — Adding Fractions (test prep)",
        "Fractions",
        5,
        "PUBLISHED",
        [1411, 1412, 1413, 1414, 1415, 1416, 1802, 1422],
    ),
    (
        3008,
        "HW8 — Mixed Revision",
        "Mixed",
        12,
        "DRAFT",
        [1004, 1106, 1303, 1416, 1503, 1613, 1712, 1805],
    ),
]

# Expand each definition into one assignment row plus one
# assignment-question row per referenced bank question.
ASSIGNMENTS = list()
ASSIGNMENT_QUESTIONS = list()
for aid, name, topic, offset, status, qb_ids in ASSIGNMENT_DEFS:
    # Created a week before the due date, but never more than 45 days back.
    created_offset = max(-45, offset - 7)
    ASSIGNMENTS.append(dict(
        id=aid,
        name=name,
        teacher_id=TUTOR["id"],
        topic=topic,
        # days_ago counts backwards, so the offset is negated; due at 23:59.
        due_date=ms(days_ago(-offset, hour=23, minute=59)),
        status=status,
        # Every question carries one mark.
        maximum_marks=len(qb_ids),
        is_deleted=False,
        created_at=ms(days_ago(-created_offset)),
        updated_at=ms(days_ago(-created_offset)),
    ))
    for order, qb_id in enumerate(qb_ids, start=1):
        ASSIGNMENT_QUESTIONS.append(dict(
            # Row id encodes the assignment id and the 1-based position.
            id=aid * 100 + order,
            assignment_id=aid,
            question_bank_id=qb_id,
            question_order=order,
            maximum_marks=1,
            rubric=None,
            created_at=ms(days_ago(-created_offset)),
        ))


# ---------------------------------------------------------------------------
# Assignment Assignees (per student × per assignment) + Student Answers
# ---------------------------------------------------------------------------

# Lookup tables keyed by row id: bank questions and assignment-question rows.
QB_BY_ID = {entry["id"]: entry for entry in QUESTION_BANK}
AQ_BY_ID = {row["id"]: row for row in ASSIGNMENT_QUESTIONS}


def assignee_status_for(assignment_status: str, persona: str, aid: int) -> str:
    """Map an assignment's status to the status recorded for its assignees.

    DRAFT -> NOT_STARTED, PUBLISHED -> IN_PROGRESS, anything else ->
    SUBMITTED. ``persona`` and ``aid`` are accepted but not consulted, so
    every student on a given assignment gets the same status.
    """
    status_by_assignment = {"DRAFT": "NOT_STARTED", "PUBLISHED": "IN_PROGRESS"}
    return status_by_assignment.get(assignment_status, "SUBMITTED")


# --- Persona-driven correctness/solve-mode generation -----------------------
#
# Each persona is a string label on the student. gen_answer() branches on
# it and, given the question and the assignment's "week index" (0 = oldest,
# higher = more recent), returns a dict with:
#   is_correct, answer_text, solve_mode, time_on_task_seconds, misconception_tag
# All randomness flows through RNG (seeded), so output is deterministic.

# The solve modes produced by the generators in this module.
SOLVE_MODES = [
    "just_answer",
    "step_by_step",
    "solve_together",
    "handwritten",
]


def base_time_for_difficulty(d: str) -> int:
    """Return the nominal time-on-task in seconds for difficulty *d*.

    Raises KeyError for anything other than "EASY", "MEDIUM" or "HARD".
    """
    seconds_by_difficulty = {"EASY": 60, "MEDIUM": 100, "HARD": 160}
    return seconds_by_difficulty[d]


def jitter_time(base: int) -> int:
    """Return *base* shifted by a random whole offset in [-25, 35].

    The result never goes below 15. Consumes exactly one draw from RNG.
    """
    offset = RNG.randint(-25, 35)
    return max(15, int(base + offset))


def pick_mode_default(persona: str, week_idx: int) -> str:
    """Draw a solve mode from the default distribution.

    Roughly 70% just_answer, 20% step_by_step, 7% solve_together and 3%
    handwritten. ``persona`` and ``week_idx`` are accepted but not consulted.
    Consumes exactly one draw from RNG.
    """
    roll = RNG.random()
    # Cumulative upper bounds, checked in order; the first one above the roll wins.
    cumulative_bounds = (
        (0.70, "just_answer"),
        (0.90, "step_by_step"),
        (0.97, "solve_together"),
    )
    for upper_bound, mode in cumulative_bounds:
        if roll < upper_bound:
            return mode
    return "handwritten"


def answer_for_persona(q: dict, persona: str, force_correct: bool) -> tuple[bool, str]:
    """Return ``(is_correct, answer_text)`` for question *q*.

    When *force_correct* is truthy the correct answer is returned. Otherwise
    the persona-specific wrong answer from ``q["_wrong_answers"]`` is used if
    one exists; failing that, the correct answer with "?" appended stands in
    as a generic wrong answer.
    """
    if force_correct:
        return True, q["correct_answer"]

    # A missing key and an explicit None both mean "no persona answers".
    persona_answers = q.get("_wrong_answers") or {}
    wrong_text = (
        persona_answers[persona]
        if persona in persona_answers
        else q["correct_answer"] + "?"
    )
    return False, wrong_text


def gen_answer(student: dict, assignment: dict, aq: dict, q: dict, week_idx: int, total_weeks: int):
    """Simulate one student's answer to one bank question.

    The student's ``_persona`` decides the probability of a correct answer
    and, for some personas, the solve mode and time budget as well. The
    function always returns a dict; it never returns None.

    RNG draws happen in a fixed order (default solve mode, persona-specific
    draws, correctness, time jitter). Reordering them changes the seeded
    output.

    Args:
        student: Student row; only ``_persona`` is read.
        assignment: Not read by this function.
        aq: Not read by this function.
        q: Question-bank row. ``difficulty``, ``topic`` and ``sub_topic`` are
            required; ``tag`` is optional. Also passed to answer_for_persona.
        week_idx: Position of the assignment in time (0 = oldest). It shifts
            the behaviour of the fraction_inversion, rushed_careless and
            solve_together_dependent personas.
        total_weeks: Not read by this function.

    Returns:
        A dict with ``is_correct``, ``answer_text``, ``solve_mode``,
        ``time_on_task_seconds`` and ``misconception_tag``. The tag is None
        for correct answers and wherever the persona has no tag for the
        question.
    """
    persona = student["_persona"]
    difficulty = q["difficulty"]
    base_time = base_time_for_difficulty(difficulty)
    is_word = q.get("tag") == "word_problem"
    is_fraction_op = q["topic"] == "Fractions" and q["sub_topic"] in ("Add", "Subtract", "Multiply")
    # Place Value questions, plus HARD subtraction/multiplication questions.
    is_place_value = q["topic"] == "Place Value" or (
        q["topic"] == "Arithmetic"
        and q["sub_topic"] in ("Subtraction", "Multiplication")
        and difficulty == "HARD"
    )

    # Default: stable_mid baseline.
    p_correct = 0.65
    # Drawn for every persona, including those that override the mode below,
    # so the RNG sequence does not depend on the persona at this point.
    solve_mode = pick_mode_default(persona, week_idx)
    misconception_tag = None

    if persona == "fraction_inversion":
        if is_fraction_op:
            # Sharp misconception: very low on fraction ops, declining.
            p_correct = max(0.03, 0.20 - 0.03 * week_idx)
            misconception_candidate = "add_tops_add_bottoms" if q["sub_topic"] in ("Add", "Subtract") else "fraction_op_confusion"
        elif q["topic"] == "Fractions":
            # Equivalent fractions etc: still shaky.
            p_correct = 0.25
            misconception_candidate = "fraction_general_uncertainty"
        else:
            p_correct = 0.78
            misconception_candidate = None
    elif persona == "place_value_gaps":
        if is_place_value:
            p_correct = 0.25
            misconception_candidate = "place_value_misalignment"
        else:
            p_correct = 0.65
            misconception_candidate = None
    elif persona == "rushed_careless":
        # Right method when forced to slow down (step_by_step), wrong when rushed.
        # In just_answer: 40% correct. In step_by_step: 90% correct.
        # Time-on-task drops over time (rushing more).
        # Solve mode mostly just_answer.
        r = RNG.random()
        solve_mode = "just_answer" if r < 0.85 else "step_by_step"
        if solve_mode == "step_by_step":
            p_correct = 0.90
        else:
            p_correct = 0.40
        misconception_candidate = "arithmetic_slip"
        # Time decays: week 0 = 0.9 * base, floor = 0.4 * base
        t_factor = max(0.4, 0.9 - 0.12 * week_idx)
        base_time = int(base_time * t_factor)
    elif persona == "solve_together_dependent":
        # solve_together usage rises sharply over time. Independent
        # accuracy is low and degrading — student is leaning on scaffolding
        # more and more.
        # Week 0 is 8%, rising 18 points per week and capped at 92%.
        st_prob = 0.08 + 0.18 * week_idx
        st_prob = min(0.92, st_prob)
        r = RNG.random()
        if r < st_prob:
            solve_mode = "solve_together"
            p_correct = 0.85
        else:
            solve_mode = "just_answer" if RNG.random() < 0.7 else "step_by_step"
            p_correct = max(0.20, 0.55 - 0.06 * week_idx)
        misconception_candidate = "scaffolding_dependence"
    elif persona == "word_problem_weak":
        if is_word:
            p_correct = 0.20
            misconception_candidate = "word_problem_interpretation"
        else:
            p_correct = 0.78
            misconception_candidate = None
    elif persona == "stable_strong":
        p_correct = 0.88 if difficulty != "HARD" else 0.78
        misconception_candidate = None
    elif persona == "stable_mid":
        p_correct = 0.65 if difficulty != "HARD" else 0.50
        misconception_candidate = None
    elif persona == "stable_weak":
        p_correct = 0.55 if difficulty != "HARD" else 0.40
        misconception_candidate = None
    else:
        # Unknown persona: keep the stable_mid baseline and no tag.
        misconception_candidate = None

    # Draw correctness, then resolve the answer text the persona would give.
    is_correct = RNG.random() < p_correct
    is_correct, answer_text = answer_for_persona(q, persona, force_correct=is_correct)
    if not is_correct:
        # The tag is only attached to wrong answers.
        misconception_tag = misconception_candidate

    return {
        "is_correct": is_correct,
        "answer_text": answer_text,
        "solve_mode": solve_mode,
        "time_on_task_seconds": jitter_time(base_time),
        "misconception_tag": misconception_tag,
    }


def derive_working_steps(q: dict, ans: dict):
    """Write the first-person "working" text that accompanies an answer.

    The sentence depends on the answer's solve mode and on whether it was
    correct. just_answer always yields the same fixed sentence. Any mode
    other than just_answer, solve_together or handwritten is treated as
    step-by-step; a wrong step-by-step answer also gets an explanation
    chosen by ``misconception_tag``, with a generic fallback for unknown or
    missing tags.

    Args:
        q: Question row; reads ``topic``, ``correct_answer`` and the optional
            ``sub_topic`` (the topic is used when it is missing or empty).
        ans: Answer dict; reads ``solve_mode``, ``is_correct``,
            ``answer_text`` and the optional ``misconception_tag``.

    Returns:
        The working-steps sentence as a string.
    """
    mode = ans["solve_mode"]
    correct = bool(ans["is_correct"])
    topic = q["topic"]
    sub_topic = q.get("sub_topic") or topic
    expected = q["correct_answer"]
    given = ans["answer_text"]
    tag = ans.get("misconception_tag")

    if mode == "just_answer":
        return "I solved it mentally and wrote the final answer only."

    if mode == "solve_together":
        return (
            f"I followed guided steps for {topic.lower()} and reached {given}."
            if correct
            else f"I needed guided help on {sub_topic.lower()}, but I still ended with {given} instead of {expected}."
        )

    if mode == "handwritten":
        return (
            f"I worked it out on paper using {sub_topic.lower()} and checked that the final answer was {given}."
            if correct
            else f"I worked it out on paper for {sub_topic.lower()}, but my final answer was {given} instead of {expected}."
        )

    # Everything else is narrated as a step-by-step attempt.
    if correct:
        return f"I used a step-by-step method for {sub_topic.lower()} and got the correct answer {given}."

    generic_explanation = f"My method did not lead to the expected answer {expected}."
    explanation = {
        "add_tops_add_bottoms": "I added the numerators and denominators directly instead of finding a common denominator.",
        "fraction_op_confusion": "I mixed up the fraction rule and used the wrong operation method.",
        "fraction_general_uncertainty": "I was unsure which fraction method to use, so my working was inconsistent.",
        "place_value_misalignment": "I lined the digits up incorrectly and misread the place value.",
        "arithmetic_slip": "My method was close, but I made a careless arithmetic slip in the calculation.",
        "scaffolding_dependence": "I could start the method, but I was not secure enough to finish it independently.",
        "word_problem_interpretation": "I misunderstood what the word problem was asking me to calculate.",
    }.get(tag, generic_explanation)
    return f"I tried a step-by-step method for {sub_topic.lower()}, but I got {given}. {explanation}"


def derive_review_fields(q: dict, ans: dict):
    """Build the AI-review columns for one generated answer.

    Understanding and confidence are looked up from the solve mode, using
    separate tables for correct and incorrect answers. Incorrect answers
    always need attention; correct ones only when the understanding score is
    below 0.72.

    The correctness score is 1.0 for a correct answer and 0.0 otherwise, and
    the question score is the mean of correctness and understanding. Both
    were previously hard-coded to 1.0, which contradicted the generated
    feedback for wrong answers.

    Args:
        q: Question row; only ``correct_answer`` is read.
        ans: Answer dict; reads ``is_correct``, ``solve_mode`` and
            ``misconception_tag``.

    Returns:
        A dict with the ``review_*`` fields and ``ai_feedback_review``.
    """
    correct = bool(ans["is_correct"])
    solve_mode = ans["solve_mode"]
    misconception_tag = ans["misconception_tag"]
    expected = q["correct_answer"]

    if correct:
        understanding_score = {
            "step_by_step": 0.95,
            "handwritten": 0.85,
            "just_answer": 0.75,
            "solve_together": 0.65,
        }.get(solve_mode, 0.75)
        confidence = {
            "step_by_step": 0.82,
            "handwritten": 0.78,
            "just_answer": 0.9,
            "solve_together": 0.62,
        }.get(solve_mode, 0.78)
        # With the table above only solve_together falls below this threshold.
        needs_attention = understanding_score < 0.72
        issue_reason = (
            "Correct answer, but there is limited evidence that the method is secure."
            if needs_attention else None
        )
        if solve_mode == "step_by_step":
            ai_feedback_review = "Correct answer with clear method evidence."
        elif needs_attention:
            ai_feedback_review = (
                "Correct answer, but understanding evidence is lighter because the response is brief."
            )
        else:
            ai_feedback_review = "Correct answer with secure understanding shown."
    else:
        understanding_score = {
            "step_by_step": 0.4,
            "handwritten": 0.32,
            "just_answer": 0.2,
            "solve_together": 0.28,
        }.get(solve_mode, 0.25)
        confidence = {
            "step_by_step": 0.55,
            "handwritten": 0.6,
            "just_answer": 0.72,
            "solve_together": 0.5,
        }.get(solve_mode, 0.6)
        needs_attention = True
        issue_reason = {
            "add_tops_add_bottoms": "The student added the numerator and denominator directly instead of finding a common denominator.",
            "fraction_op_confusion": "The student confused the fraction operation and did not apply the correct method.",
            "fraction_general_uncertainty": "The student shows insecure understanding of equivalent or comparable fractions.",
            "place_value_misalignment": "The student misread place value, causing digits to be aligned incorrectly.",
            "arithmetic_slip": "The final answer is wrong, suggesting a careless arithmetic slip rather than a secure method.",
            "scaffolding_dependence": "The student appears dependent on scaffolding and does not show secure independent understanding.",
            "word_problem_interpretation": "The student did not translate the word problem into the correct calculation.",
        }.get(
            misconception_tag,
            f"The answer does not match the correct answer ({expected}), showing an incomplete understanding of the method.",
        )
        ai_feedback_review = f"Incorrect answer compared with the expected answer {expected}. {issue_reason}"

    # Same blend that build_assignment_review_summary uses per question:
    # the mean of correctness (0 or 1) and understanding.
    # NOTE(review): confirm this matches the production meaning of
    # review_question_score.
    correctness_score = 1.0 if correct else 0.0
    question_score = (correctness_score + understanding_score) / 2

    return {
        "review_correctness_score": correctness_score,
        "review_question_score": round(question_score, 3),
        "review_understanding_score": round(understanding_score, 3),
        "review_confidence": round(confidence, 3),
        "review_needs_attention": needs_attention,
        "review_issue_reason": issue_reason,
        "review_tags": [misconception_tag] if misconception_tag else [],
        "ai_feedback_review": ai_feedback_review,
    }


def build_assignment_review_summary(student_name: str, assignment_name: str, reviews: list[dict]):
    """Summarise one pupil's per-question reviews for a single assignment.

    Each entry in *reviews* must provide the keys ``topic``, ``is_correct``,
    ``review_understanding_score`` and ``review_needs_attention``. A
    question's score is the mean of its correctness (1.0 or 0.0) and its
    understanding score; the overall score is the mean question score
    multiplied by 10 and rounded to two decimals.

    Returns:
        A ``(overall_score, ai_feedback, next_step_outcome)`` tuple. The
        outcome is ``"accept"`` for scores of 6.0 and above, ``"support"``
        for 4.5 and above, and ``"redo"`` otherwise. When *reviews* is empty
        the score and outcome are ``None`` and the feedback states that no
        work has been submitted.
    """
    if not reviews:
        return None, "No submitted work is available for this assignment yet.", None

    question_scores = []
    scores_by_topic: dict[str, list[float]] = {}
    flagged = 0
    n_correct = 0

    for entry in reviews:
        got_it_right = entry["is_correct"]
        blended = ((1.0 if got_it_right else 0.0) + entry["review_understanding_score"]) / 2
        question_scores.append(blended)
        scores_by_topic.setdefault(entry["topic"], []).append(blended)
        flagged += 1 if entry["review_needs_attention"] else 0
        n_correct += 1 if got_it_right else 0

    overall_score = round(sum(question_scores) / len(question_scores) * 10, 2)

    # Rank topics by mean question score, lowest first; equal means fall back
    # to alphabetical topic order. Only the two weakest are reported.
    ranked = sorted((sum(vals) / len(vals), topic) for topic, vals in scores_by_topic.items())
    weakest = ranked[:2]
    if weakest:
        weakest_topic_text = ", ".join(topic for _, topic in weakest)
    else:
        weakest_topic_text = "general fluency"

    if overall_score >= 6.0:
        outcome = "accept"
    elif overall_score >= 4.5:
        outcome = "support"
    else:
        outcome = "redo"

    summary = (
        f"{student_name} completed {assignment_name} with {n_correct}/{len(reviews)} correct responses. "
        f"Overall score is {overall_score}/10. "
        f"The weakest areas were {weakest_topic_text}. "
        f"{flagged} question(s) need extra attention."
    )

    return overall_score, summary, outcome


# week_idx is threaded through the assignments in chronological order, so sort
# the definitions by due offset (tuple index 3), oldest first.
ASSIGNMENT_DEFS_SORTED = list(ASSIGNMENT_DEFS)
ASSIGNMENT_DEFS_SORTED.sort(key=lambda definition: definition[3])
weeks_total = len(ASSIGNMENT_DEFS_SORTED)

# Output tables, filled in by the generation loop below.
ASSIGNMENT_ASSIGNEES, STUDENT_ANSWERS, ACTIVITY_LOGS = [], [], []

# Next id to hand out for each output table.
assignee_id_seq, answer_id_seq, log_id_seq = 4000, 50000, 70000

# Students who "fall behind" on attempts (bonus signal).
# 203 is Chen Wei: persona=rushed_careless + 8 days no activity.
LOW_ATTEMPT_RATE_STUDENTS = {203}
# Student 203 hasn't attempted recent assignments.
SKIP_RECENT_ASSIGNMENTS_STUDENTS = {203}


def _not_started_assignee(assignee_id: int, assignment_id: int, student_id: int, due_offset: float) -> dict:
    """Build the assignee row for a pupil who has not opened an assignment.

    The row carries the same keys as a started or submitted row, with the
    attempt and review fields set to ``None``, so every record in
    assignment_assignees.json has one shape. ``created_at`` is seven days
    before the assignment's due offset, matching started rows.
    """
    return {
        "id": assignee_id,
        "assignment_id": assignment_id,
        "student_id": student_id,
        "classroom_id": CLASSROOM["id"],
        "status": "NOT_STARTED",
        "started_at": None,
        "submitted_at": None,
        "total_marks": None,
        "overall_score": None,
        "ai_feedback": None,
        "next_step_outcome": None,
        "is_active": True,
        "deactivated_at": None,
        "created_at": ms(days_ago(-(due_offset - 7))),
    }


# Generate assignee rows, answers and activity logs for every
# (assignment, student) pair. The order of RNG draws in this loop is
# load-bearing: the generator is seeded, so adding, removing or reordering a
# draw changes every value generated after it.
for week_idx, assignment_def in enumerate(ASSIGNMENT_DEFS_SORTED):
    aid, name, topic, offset, status, qb_ids = assignment_def
    a_questions = [aq for aq in ASSIGNMENT_QUESTIONS if aq["assignment_id"] == aid]

    for student in STUDENTS:
        sid = student["id"]

        # Pupils in either "falls behind" set never open an assignment that is
        # not CLOSED. No RNG is consumed on this path.
        falls_behind = sid in SKIP_RECENT_ASSIGNMENTS_STUDENTS or sid in LOW_ATTEMPT_RATE_STUDENTS
        if status != "CLOSED" and falls_behind:
            # DRAFT assignments get no assignee rows for any pupil.
            if status != "DRAFT":
                ASSIGNMENT_ASSIGNEES.append(_not_started_assignee(assignee_id_seq, aid, sid, offset))
                assignee_id_seq += 1
            continue

        if status in ("IN_PROGRESS", "PUBLISHED"):
            # Open assignment: about 85% of pupils have started it and nobody
            # has submitted yet.
            started = RNG.random() < 0.85
            submitted = False
            if not started:
                ASSIGNMENT_ASSIGNEES.append(_not_started_assignee(assignee_id_seq, aid, sid, offset))
                assignee_id_seq += 1
                continue
        else:
            submitted = True

        # Attempts begin at offset - 5 days (created_at uses offset - 7), no
        # earlier than 42 days before TODAY, at a random time from 16:00 to 19:59.
        attempt_day_offset = max(offset - 5, -42)
        started_at = days_ago(-attempt_day_offset, hour=16 + RNG.randint(0, 3), minute=RNG.randint(0, 59))

        # DRAFT assignments have no assignees. This check stays after the
        # started_at draws so the seeded RNG sequence is unchanged.
        if status == "DRAFT":
            continue

        assignee_id = assignee_id_seq
        assignee_id_seq += 1

        # Generate answers
        total_score = 0
        questions_to_answer = a_questions
        if not submitted:
            # Partial completion: between half (at least one) and all of the
            # questions, taken from the start of the list.
            n_done = RNG.randint(max(1, len(a_questions) // 2), len(a_questions))
            questions_to_answer = a_questions[:n_done]

        running_time_offset = 0
        answer_reviews = []
        for aq in questions_to_answer:
            q = QB_BY_ID[aq["question_bank_id"]]
            ans = gen_answer(student, assignment_def, aq, q, week_idx, weeks_total)
            working_steps = derive_working_steps(q, ans)
            review = derive_review_fields(q, ans)
            answered_at = started_at + timedelta(seconds=running_time_offset + ans["time_on_task_seconds"])
            # A 5-30 second gap separates consecutive questions.
            running_time_offset += ans["time_on_task_seconds"] + RNG.randint(5, 30)
            marks = 1 if ans["is_correct"] else 0

            STUDENT_ANSWERS.append({
                "id": answer_id_seq,
                "assignee_id": assignee_id,
                "assignment_question_id": aq["id"],
                "answer_type": "LATEX",
                "answer_latex": ans["answer_text"],
                "extracted_answer": ans["answer_text"],
                "solve_mode": ans["solve_mode"],
                "working_steps": working_steps,
                "graded_marks": marks,
                "marks_awarded": marks,
                "ai_reasoning": (
                    "Answer matches expected solution." if ans["is_correct"]
                    else f"Incorrect; expected {q['correct_answer']}."
                ),
                "is_correct": ans["is_correct"],
                "ai_feedback": review["ai_feedback_review"],
                "review_needs_attention": review["review_needs_attention"],
                "review_issue_reason": review["review_issue_reason"],
                "review_correctness_score": review["review_correctness_score"],
                "review_understanding_score": review["review_understanding_score"],
                "review_question_score": review["review_question_score"],
                "review_confidence": review["review_confidence"],
                "review_tags": review["review_tags"],
                "grading_status": "GRADED",
                "grading_attempts": 1,
                "is_active": True,
                "created_at": ms(answered_at),
                # ---- Hackathon annotations (not in production schema) ----
                "_solve_mode": ans["solve_mode"],
                "_time_on_task_seconds": ans["time_on_task_seconds"],
                "_is_correct": ans["is_correct"],
                "_misconception_tag": ans["misconception_tag"],
                "_question_topic": q["topic"],
                "_question_sub_topic": q["sub_topic"],
                "_question_difficulty": q["difficulty"],
                "_answered_at": ms(answered_at),
            })
            # Minimal per-question record consumed by build_assignment_review_summary.
            answer_reviews.append({
                "topic": q["topic"],
                "is_correct": ans["is_correct"],
                "review_understanding_score": review["review_understanding_score"],
                "review_needs_attention": review["review_needs_attention"],
            })
            answer_id_seq += 1
            total_score += marks

            # Activity log
            ACTIVITY_LOGS.append({
                "id": log_id_seq,
                "assignee_id": assignee_id,
                "assignment_question_id": aq["id"],
                "activity_type": "ANSWERED",
                "timestamp": ms(answered_at),
                "duration_seconds": ans["time_on_task_seconds"],
                "extra_data": {"solve_mode": ans["solve_mode"]},
                "created_at": ms(answered_at),
                "_student_id": sid,
            })
            log_id_seq += 1

        submitted_at = started_at + timedelta(seconds=running_time_offset) if submitted else None
        overall_score, assignee_ai_feedback, next_step_outcome = build_assignment_review_summary(
            student["fullname"],
            name,
            answer_reviews,
        )

        # Marks and review summary are only published once the work is submitted.
        assignee = {
            "id": assignee_id,
            "assignment_id": aid,
            "student_id": sid,
            "classroom_id": CLASSROOM["id"],
            "status": "SUBMITTED" if submitted else "IN_PROGRESS",
            "started_at": ms(started_at),
            "submitted_at": ms(submitted_at) if submitted_at else None,
            "total_marks": total_score if submitted else None,
            "overall_score": overall_score if submitted else None,
            "ai_feedback": assignee_ai_feedback if submitted else None,
            "next_step_outcome": next_step_outcome if submitted else None,
            "is_active": True,
            "deactivated_at": None,
            "created_at": ms(days_ago(-(offset - 7))),
        }
        ASSIGNMENT_ASSIGNEES.append(assignee)


# ---------------------------------------------------------------------------
# Write outputs
# ---------------------------------------------------------------------------

def write_json(name: str, data) -> None:
    """Write *data* as indented JSON to ``OUT_DIR / name`` and print a summary line.

    Args:
        name: File name, relative to ``OUT_DIR``.
        data: JSON-serialisable object. Values the encoder cannot handle
            natively are written as ``str(value)`` because of ``default=str``.
    """
    path = OUT_DIR / name
    # Pin the encoding: without it, open() falls back to the locale's
    # preferred encoding, which varies between machines.
    with path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, default=str)
    # Lists report their length; any other top-level value is labelled "object".
    record_count = len(data) if isinstance(data, list) else "object"
    print(f"  wrote {name}  ({record_count} records)")


print("Generating mock dataset...")

# classroom.json bundles the classroom with its tutor and membership rows.
write_json(
    "classroom.json",
    {
        "classroom": CLASSROOM,
        "tutor": TUTOR,
        "classroom_student_rs": CLASSROOM_STUDENT_RS,
    },
)

# One file per table, written in this fixed order.
for table_file, table_rows in (
    ("students.json", STUDENTS),
    ("question_bank.json", QUESTION_BANK),
    ("assignments.json", ASSIGNMENTS),
    ("assignment_questions.json", ASSIGNMENT_QUESTIONS),
    ("assignment_assignees.json", ASSIGNMENT_ASSIGNEES),
    ("student_answers.json", STUDENT_ANSWERS),
    ("activity_logs.json", ACTIVITY_LOGS),
):
    write_json(table_file, table_rows)

# dataset.json repeats every table in a single document, preceded by a _meta
# header. The timestamps come from the fixed TODAY reference rather than the
# wall clock, so the header is identical on every run.
dataset_meta = {
    "generated_at_utc": TODAY.isoformat(),
    "reference_today": TODAY.date().isoformat(),
    "schema_source": "elevenplus-backend/src/app/models/",
    "subject": "Maths (UK 11+)",
    "students": len(STUDENTS),
    "assignments": len(ASSIGNMENTS),
    "questions_in_bank": len(QUESTION_BANK),
    "student_answers": len(STUDENT_ANSWERS),
    "expected_top_3_at_risk_student_ids": [201, 203, 204],
}
dataset = {
    "_meta": dataset_meta,
    "classroom": CLASSROOM,
    "tutor": TUTOR,
    "classroom_student_rs": CLASSROOM_STUDENT_RS,
    "students": STUDENTS,
    "question_bank": QUESTION_BANK,
    "assignments": ASSIGNMENTS,
    "assignment_questions": ASSIGNMENT_QUESTIONS,
    "assignment_assignees": ASSIGNMENT_ASSIGNEES,
    "student_answers": STUDENT_ANSWERS,
    "activity_logs": ACTIVITY_LOGS,
}
write_json("dataset.json", dataset)
print("Done.")