Files
BoostAI/Mock-Data/generate.py
2026-05-25 17:05:06 +01:00

1009 lines
42 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Mock pupil-history dataset generator for the BoostAI "Learning Path Agent"
hackathon challenge.
Outputs JSON files under the same directory, mirroring the production
SQLAlchemy schema in elevenplus-backend/src/app/models/. The generator is
deterministic (seeded RNG) so re-runs produce identical output.
Run:
python3 generate.py
Outputs:
classroom.json, students.json, question_bank.json, assignments.json,
assignment_questions.json, assignment_assignees.json,
student_answers.json, activity_logs.json, dataset.json
"""
from __future__ import annotations
import json
import os
import random
from datetime import datetime, timedelta, timezone
from pathlib import Path
# Reference date — keep in sync with the hackathon brief.
# days_ago() measures every offset from this instant.
TODAY = datetime(2026, 5, 1, 9, 0, 0, tzinfo=timezone.utc)
# JSON outputs are written next to this script.
OUT_DIR = Path(__file__).parent
# Dedicated seeded generator; all random draws in this file go through it.
RNG = random.Random(20260501)
def ms(dt: datetime) -> int:
    """Return *dt* as a Unix timestamp in whole milliseconds (truncated)."""
    epoch_seconds = dt.timestamp()
    return int(epoch_seconds * 1000)
def days_ago(n: float, hour: int = 10, minute: int = 0) -> datetime:
    """Return the moment *n* days before TODAY, pinned to ``hour:minute:00``.

    A negative *n* yields a moment after TODAY. Seconds and microseconds
    are always zeroed.
    """
    shifted = TODAY - timedelta(days=n)
    return shifted.replace(hour=hour, minute=minute, second=0, microsecond=0)
# ---------------------------------------------------------------------------
# Classroom + Tutor + Students
# ---------------------------------------------------------------------------
# The single tutor record; its id is referenced by CLASSROOM, by every
# question-bank entry (teacher_id) and by every assignment (teacher_id).
TUTOR = {
    "id": 100,
    "fullname": "Sarah Johnson",
    "email": "sarah.johnson@boostai.example",
    "username": "sjohnson",
    "role": "tutor",
    "active": True,
    "is_test": False,
    "is_deleted": False,
    "created_at": ms(days_ago(180)),
    "updated_at": ms(days_ago(180)),
}
# The single classroom record; every generated student is linked to it.
CLASSROOM = {
    "id": 500,
    "name": "Year 6 — Maths Set 1",
    "organization_id": 1,
    "tutor_id": TUTOR["id"],
    "invite_code": "Y6MATHS1",
    "target_level": 6,
    "archived": False,
    "hide_just_answer": False,
    "is_deleted": False,
    "created_at": ms(days_ago(60)),
    "updated_at": ms(days_ago(60)),
}
# 12 students. _persona is a hackathon annotation (not in the production
# schema) — used to drive answer generation below and to document expected
# agent output. Strip the underscore-prefixed fields if seeding the real DB.
# (student id, full name, persona key). The persona key selects the branch
# taken in gen_answer() when this student's answers are simulated.
STUDENTS_RAW = [
    (201, "Aisha Khan", "fraction_inversion"),
    (202, "Ben Carter", "place_value_gaps"),
    (203, "Chen Wei", "rushed_careless"),
    (204, "Daniela Rossi", "solve_together_dependent"),
    (205, "Elif Demir", "word_problem_weak"),
    (206, "Felix Brown", "stable_strong"),
    (207, "Grace Park", "stable_strong"),
    (208, "Harry Singh", "stable_mid"),
    (209, "Isla Nakamura", "stable_mid"),
    (210, "Jaden Williams", "stable_mid"),
    (211, "Kira Patel", "stable_weak"),
    (212, "Liam O'Connor", "stable_weak"),
]
# Expand each raw tuple into a student row plus its classroom membership row.
STUDENTS = []
CLASSROOM_STUDENT_RS = []
joined_ms = ms(days_ago(55))  # every student joined on the same day
for student_id, full_name, persona_key in STUDENTS_RAW:
    # Login handle: lower-cased first name with apostrophes removed.
    handle = full_name.split()[0].lower().replace("'", "")
    student_row = {
        "id": student_id,
        "fullname": full_name,
        "email": f"{handle}.{student_id}@boostai.example",
        "username": f"{handle}{student_id}",
        "role": "student",
        "active": True,
        "is_test": False,
        "is_deleted": False,
        "created_at": joined_ms,
        "updated_at": joined_ms,
        "_persona": persona_key,
    }
    membership_row = {
        "id": 600 + student_id,
        "classroom_id": CLASSROOM["id"],
        "student_id": student_id,
        "created_at": joined_ms,
    }
    STUDENTS.append(student_row)
    CLASSROOM_STUDENT_RS.append(membership_row)
# ---------------------------------------------------------------------------
# Question Bank — Maths, 11+
# Fields mirror question_bank.py.
# _wrong_answers: hackathon helper — maps a misconception persona to the
# answer that persona typically produces. Not in production schema.
# ---------------------------------------------------------------------------
# Question ids are grouped by topic in blocks of 100 (10xx Place Value,
# 11xx Arithmetic, 12xx Negative Numbers, 13xx BIDMAS, 14xx Fractions,
# 15xx Algebra, 16xx Geometry, 17xx Data, 18xx word problems).
# The optional "_wrong_answers" key maps a persona to the wrong answer that
# persona gives; answer_for_persona() falls back to a generic wrong answer
# when the persona has no entry.
QUESTION_BANK = [
    # ---- Place Value (tens, hundreds, thousands, decimals) ----
    {"id": 1001, "topic": "Place Value", "sub_topic": "Multi-digit numbers", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is the value of the digit 7 in the number 4,732?",
     "correct_answer": "700",
     "_wrong_answers": {"place_value_gaps": "70"}},
    {"id": 1002, "topic": "Place Value", "sub_topic": "Multi-digit numbers", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Round 24,587 to the nearest thousand.",
     "correct_answer": "25000",
     "_wrong_answers": {"place_value_gaps": "24000"}},
    {"id": 1003, "topic": "Place Value", "sub_topic": "Decimals", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Write 0.07 as a fraction in its simplest form.",
     "correct_answer": "7/100",
     "_wrong_answers": {"place_value_gaps": "7/10"}},
    {"id": 1004, "topic": "Place Value", "sub_topic": "Decimals", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is 0.3 + 0.07?",
     "correct_answer": "0.37",
     "_wrong_answers": {"place_value_gaps": "0.10"}},
    # ---- Arithmetic ----
    {"id": 1101, "topic": "Arithmetic", "sub_topic": "Addition", "tag": None,
     "difficulty": "EASY",
     "question_text": "Calculate 246 + 137.",
     "correct_answer": "383",
     "_wrong_answers": {"rushed_careless": "373"}},
    {"id": 1102, "topic": "Arithmetic", "sub_topic": "Subtraction", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Calculate 503 - 47.",
     "correct_answer": "456",
     "_wrong_answers": {"place_value_gaps": "544", "rushed_careless": "466"}},
    {"id": 1103, "topic": "Arithmetic", "sub_topic": "Subtraction", "tag": None,
     "difficulty": "HARD",
     "question_text": "Calculate 4,002 - 1,375.",
     "correct_answer": "2627",
     "_wrong_answers": {"place_value_gaps": "3737", "rushed_careless": "2617"}},
    {"id": 1104, "topic": "Arithmetic", "sub_topic": "Multiplication", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Calculate 28 x 7.",
     "correct_answer": "196",
     "_wrong_answers": {"rushed_careless": "186"}},
    {"id": 1105, "topic": "Arithmetic", "sub_topic": "Division", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Calculate 144 / 6.",
     "correct_answer": "24",
     "_wrong_answers": {"rushed_careless": "26"}},
    {"id": 1106, "topic": "Arithmetic", "sub_topic": "Multiplication", "tag": None,
     "difficulty": "HARD",
     "question_text": "Calculate 156 x 24.",
     "correct_answer": "3744",
     "_wrong_answers": {"rushed_careless": "3724", "place_value_gaps": "374"}},
    # ---- Negative Numbers ----
    {"id": 1201, "topic": "Negative Numbers", "sub_topic": "Addition", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is -5 + 8?",
     "correct_answer": "3"},
    {"id": 1202, "topic": "Negative Numbers", "sub_topic": "Subtraction", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 4 - 9?",
     "correct_answer": "-5"},
    {"id": 1203, "topic": "Negative Numbers", "sub_topic": "Mixed", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is -7 - (-3)?",
     "correct_answer": "-4"},
    # ---- BIDMAS ----
    {"id": 1301, "topic": "BIDMAS", "sub_topic": None, "tag": None,
     "difficulty": "EASY",
     "question_text": "Calculate 5 + 3 x 2.",
     "correct_answer": "11"},
    {"id": 1302, "topic": "BIDMAS", "sub_topic": None, "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Calculate (8 - 3) x 4 + 2.",
     "correct_answer": "22"},
    {"id": 1303, "topic": "BIDMAS", "sub_topic": None, "tag": None,
     "difficulty": "HARD",
     "question_text": "Calculate 24 / (2 + 4) + 3 x 5.",
     "correct_answer": "19",
     "_wrong_answers": {"rushed_careless": "17"}},
    # ---- Fractions: Equivalent ----
    {"id": 1401, "topic": "Fractions", "sub_topic": "Equivalent", "tag": None,
     "difficulty": "EASY",
     "question_text": "Which fraction is equivalent to 2/4?",
     "correct_answer": "1/2"},
    {"id": 1402, "topic": "Fractions", "sub_topic": "Equivalent", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Simplify 12/18 to its lowest terms.",
     "correct_answer": "2/3"},
    {"id": 1403, "topic": "Fractions", "sub_topic": "Equivalent", "tag": None,
     "difficulty": "HARD",
     "question_text": "Which is larger: 3/5 or 5/8? Give your answer.",
     "correct_answer": "5/8"},
    # ---- Fractions: Add/Subtract (CORE for fraction_inversion persona) ----
    {"id": 1411, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is 1/2 + 1/3?",
     "correct_answer": "5/6",
     "_wrong_answers": {"fraction_inversion": "2/5"}},
    {"id": 1412, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is 1/4 + 1/2?",
     "correct_answer": "3/4",
     "_wrong_answers": {"fraction_inversion": "2/6"}},
    {"id": 1413, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 2/5 + 1/3?",
     "correct_answer": "11/15",
     "_wrong_answers": {"fraction_inversion": "3/8"}},
    {"id": 1414, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 3/4 + 1/6?",
     "correct_answer": "11/12",
     "_wrong_answers": {"fraction_inversion": "4/10"}},
    {"id": 1415, "topic": "Fractions", "sub_topic": "Subtract", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 5/6 - 1/3?",
     "correct_answer": "1/2",
     "_wrong_answers": {"fraction_inversion": "4/3"}},
    {"id": 1416, "topic": "Fractions", "sub_topic": "Add", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is 7/12 + 5/8?",
     "correct_answer": "29/24",
     "_wrong_answers": {"fraction_inversion": "12/20"}},
    # ---- Fractions: Multiply ----
    {"id": 1421, "topic": "Fractions", "sub_topic": "Multiply", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is 1/2 x 1/3?",
     "correct_answer": "1/6",
     "_wrong_answers": {"fraction_inversion": "2/3"}},
    {"id": 1422, "topic": "Fractions", "sub_topic": "Multiply", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is 2/3 x 3/4?",
     "correct_answer": "1/2",
     "_wrong_answers": {"fraction_inversion": "5/7"}},
    {"id": 1423, "topic": "Fractions", "sub_topic": "Multiply", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is 4/5 of 35?",
     "correct_answer": "28",
     "_wrong_answers": {"fraction_inversion": "20"}},
    # ---- Algebra: Simple Equations ----
    {"id": 1501, "topic": "Algebra", "sub_topic": "Simple Equations", "tag": None,
     "difficulty": "EASY",
     "question_text": "Solve x + 7 = 12.",
     "correct_answer": "5"},
    {"id": 1502, "topic": "Algebra", "sub_topic": "Simple Equations", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Solve 3x - 4 = 17.",
     "correct_answer": "7",
     "_wrong_answers": {"rushed_careless": "8"}},
    {"id": 1503, "topic": "Algebra", "sub_topic": "Simple Equations", "tag": None,
     "difficulty": "HARD",
     "question_text": "Solve 2(x + 3) = 18.",
     "correct_answer": "6"},
    # ---- Algebra: Sequences ----
    {"id": 1511, "topic": "Algebra", "sub_topic": "Sequences", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is the next term: 2, 5, 8, 11, ___ ?",
     "correct_answer": "14"},
    {"id": 1512, "topic": "Algebra", "sub_topic": "Sequences", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Find the 10th term of the sequence 4, 7, 10, 13, ...",
     "correct_answer": "31"},
    {"id": 1513, "topic": "Algebra", "sub_topic": "Sequences", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is the nth term of the sequence 5, 8, 11, 14, ...?",
     "correct_answer": "3n+2"},
    # ---- Geometry: Area & Perimeter ----
    {"id": 1601, "topic": "Geometry", "sub_topic": "Perimeter", "tag": None,
     "difficulty": "EASY",
     "question_text": "What is the perimeter of a square with side length 7 cm?",
     "correct_answer": "28"},
    {"id": 1602, "topic": "Geometry", "sub_topic": "Area", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "What is the area of a rectangle 8 cm by 5 cm?",
     "correct_answer": "40"},
    {"id": 1603, "topic": "Geometry", "sub_topic": "Area", "tag": None,
     "difficulty": "HARD",
     "question_text": "A right-angled triangle has base 6 cm and height 9 cm. What is its area?",
     "correct_answer": "27"},
    # ---- Geometry: Angles ----
    {"id": 1611, "topic": "Geometry", "sub_topic": "Angles", "tag": None,
     "difficulty": "EASY",
     "question_text": "Two angles on a straight line are 110° and x. What is x?",
     "correct_answer": "70"},
    {"id": 1612, "topic": "Geometry", "sub_topic": "Angles", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "The angles of a triangle are 45°, 60° and x. What is x?",
     "correct_answer": "75"},
    {"id": 1613, "topic": "Geometry", "sub_topic": "Angles", "tag": None,
     "difficulty": "HARD",
     "question_text": "What is the sum of interior angles of a hexagon?",
     "correct_answer": "720"},
    # ---- Data: Mean / Median / Mode ----
    {"id": 1701, "topic": "Data", "sub_topic": "Mean", "tag": None,
     "difficulty": "EASY",
     "question_text": "Find the mean of 4, 6, 8, 10, 12.",
     "correct_answer": "8"},
    {"id": 1702, "topic": "Data", "sub_topic": "Median", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "Find the median of 3, 7, 2, 9, 5.",
     "correct_answer": "5"},
    {"id": 1703, "topic": "Data", "sub_topic": "Mode", "tag": None,
     "difficulty": "EASY",
     "question_text": "Find the mode of 2, 3, 3, 5, 7, 7, 7, 8.",
     "correct_answer": "7"},
    # ---- Data: Probability ----
    {"id": 1711, "topic": "Data", "sub_topic": "Probability", "tag": None,
     "difficulty": "MEDIUM",
     "question_text": "A bag has 3 red and 5 blue marbles. What is the probability of red?",
     "correct_answer": "3/8"},
    {"id": 1712, "topic": "Data", "sub_topic": "Probability", "tag": None,
     "difficulty": "HARD",
     "question_text": "A fair die is rolled. What is the probability of an even number greater than 2?",
     "correct_answer": "1/3"},
    # ---- Word problems (cross-topic) ----
    {"id": 1801, "topic": "Arithmetic", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "EASY",
     "question_text": "Tom has 24 apples. He gives 9 to his friend. How many does he have left?",
     "correct_answer": "15"},
    {"id": 1802, "topic": "Fractions", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "MEDIUM",
     "question_text": "A pizza is cut into 8 slices. Sara eats 1/4 and Tom eats 3/8. What fraction is left?",
     "correct_answer": "3/8",
     "_wrong_answers": {"word_problem_weak": "1/2", "fraction_inversion": "4/12"}},
    {"id": 1803, "topic": "Arithmetic", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "MEDIUM",
     "question_text": "A train ticket costs £8.50. How much do 6 tickets cost?",
     "correct_answer": "51",
     "_wrong_answers": {"word_problem_weak": "48", "rushed_careless": "50"}},
    {"id": 1804, "topic": "Algebra", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "HARD",
     "question_text": "Three consecutive numbers add up to 72. What is the smallest number?",
     "correct_answer": "23",
     "_wrong_answers": {"word_problem_weak": "24"}},
    {"id": 1805, "topic": "Geometry", "sub_topic": "Word problem", "tag": "word_problem",
     "difficulty": "HARD",
     "question_text": "A rectangular garden is 12 m long and 4 m shorter than it is long. What is its area?",
     "correct_answer": "96",
     "_wrong_answers": {"word_problem_weak": "48"}},
]
# Fill in remaining standard fields for every bank entry. Values already
# present on an entry are kept; only missing keys are added, in this order.
bank_stamp_ms = ms(days_ago(40))
BANK_FIELD_DEFAULTS = {
    "category": "Math",
    "year_level": "Year 6",
    "source": "BOOST",
    "source_description": None,
    "teacher_id": TUTOR["id"],
    "maximum_marks": 1,
    "rubric": None,
    "step_by_step_solution": None,
    "image_url": None,
    "is_deleted": False,
    "created_at": bank_stamp_ms,
    "updated_at": bank_stamp_ms,
}
for q in QUESTION_BANK:
    for field_name, field_default in BANK_FIELD_DEFAULTS.items():
        q.setdefault(field_name, field_default)
    # A fresh dict per entry so questions never share one mutable map.
    q.setdefault("_wrong_answers", {})
# ---------------------------------------------------------------------------
# Assignments + Assignment Questions
#
# 8 assignments. Distribution: 5 CLOSED (past), 2 PUBLISHED (in-flight),
# 1 DRAFT (future). The deadline-pressure assignment is a Fractions/Add
# assignment due in 5 days — drives the bonus Early Warning topic correlation.
# ---------------------------------------------------------------------------
# due_offset_days is relative to TODAY: negative = already due, positive =
# due in the future (the due date is built as days_ago(-offset)).
ASSIGNMENT_DEFS = [
    # (id, name, focus_topic, due_offset_days, status, question_bank_ids)
    (3001, "HW1 — Place Value Warmup", "Place Value", -28, "CLOSED",
     [1001, 1002, 1003, 1101, 1102, 1301, 1401, 1701]),
    (3002, "HW2 — Arithmetic Practice", "Arithmetic", -22, "CLOSED",
     [1101, 1102, 1103, 1104, 1105, 1106, 1801, 1803]),
    (3003, "HW3 — Fractions Foundations", "Fractions", -16, "CLOSED",
     [1401, 1402, 1411, 1412, 1413, 1421, 1422, 1802]),
    (3004, "HW4 — Negatives & BIDMAS", "BIDMAS", -10, "CLOSED",
     [1201, 1202, 1203, 1301, 1302, 1303, 1502, 1701]),
    (3005, "HW5 — Geometry Basics", "Geometry", -6, "CLOSED",
     [1601, 1602, 1603, 1611, 1612, 1613, 1804, 1805]),
    (3006, "HW6 — Algebra & Sequences", "Algebra", 2, "PUBLISHED",
     [1501, 1502, 1503, 1511, 1512, 1513, 1804, 1702]),
    # The deadline-pressure assignment — bonus Early Warning anchors here.
    (3007, "HW7 — Adding Fractions (test prep)", "Fractions", 5, "PUBLISHED",
     [1411, 1412, 1413, 1414, 1415, 1416, 1802, 1422]),
    (3008, "HW8 — Mixed Revision", "Mixed", 12, "DRAFT",
     [1004, 1106, 1303, 1416, 1503, 1613, 1712, 1805]),
]
# Expand each definition tuple into one assignment row plus one
# assignment_question row per referenced bank question.
ASSIGNMENTS = []
ASSIGNMENT_QUESTIONS = []
for assignment_id, title, focus, due_in_days, state, bank_ids in ASSIGNMENT_DEFS:
    # Created ~1 week before the due date, but never more than 45 days back.
    created_in_days = max(due_in_days - 7, -45)
    created_ms = ms(days_ago(-created_in_days))
    assignment_row = {
        "id": assignment_id,
        "name": title,
        "teacher_id": TUTOR["id"],
        "topic": focus,
        "due_date": ms(days_ago(-due_in_days, hour=23, minute=59)),
        "status": state,
        "maximum_marks": len(bank_ids),
        "is_deleted": False,
        "created_at": created_ms,
        "updated_at": created_ms,
    }
    ASSIGNMENTS.append(assignment_row)
    # Row ids are derived from the assignment id and the 1-based position.
    ASSIGNMENT_QUESTIONS.extend(
        {
            "id": assignment_id * 100 + position,
            "assignment_id": assignment_id,
            "question_bank_id": bank_id,
            "question_order": position,
            "maximum_marks": 1,
            "rubric": None,
            "created_at": created_ms,
        }
        for position, bank_id in enumerate(bank_ids, start=1)
    )
# ---------------------------------------------------------------------------
# Assignment Assignees (per student × per assignment) + Student Answers
# ---------------------------------------------------------------------------
# Primary-key lookups for bank questions and assignment questions.
QB_BY_ID = {}
for bank_row in QUESTION_BANK:
    QB_BY_ID[bank_row["id"]] = bank_row
AQ_BY_ID = {}
for question_row in ASSIGNMENT_QUESTIONS:
    AQ_BY_ID[question_row["id"]] = question_row
def assignee_status_for(assignment_status: str, persona: str, aid: int) -> str:
    """Map an assignment status to the default assignee status.

    DRAFT -> NOT_STARTED, PUBLISHED -> IN_PROGRESS, anything else ->
    SUBMITTED. ``persona`` and ``aid`` are accepted but not consulted.
    """
    by_assignment_status = {
        "DRAFT": "NOT_STARTED",
        # Some students have started it, some not.
        "PUBLISHED": "IN_PROGRESS",
    }
    return by_assignment_status.get(assignment_status, "SUBMITTED")
# --- Persona-driven correctness/solve-mode generation -----------------------
#
# Personas are handled as branches inside gen_answer(): given the student,
# the question, and the assignment's "week index" (0 = oldest, higher = more
# recent), it returns a dict with is_correct, answer_text, solve_mode,
# time_on_task_seconds and misconception_tag.
# All randomness flows through RNG (seeded), so output is deterministic.
# The four solve modes that pick_mode_default() can return.
SOLVE_MODES = ["just_answer", "step_by_step", "solve_together", "handwritten"]
def base_time_for_difficulty(d: str) -> int:
    """Return the nominal time on task, in seconds, for difficulty *d*.

    Raises KeyError for anything other than "EASY", "MEDIUM" or "HARD".
    """
    seconds_by_level = {"EASY": 60, "MEDIUM": 100, "HARD": 160}
    return seconds_by_level[d]
def jitter_time(base: int) -> int:
    """Add a random offset in [-25, 35] to *base*, with a floor of 15.

    Consumes exactly one draw from the shared RNG.
    """
    noisy = base + RNG.randint(-25, 35)
    return max(15, int(noisy))
def pick_mode_default(persona: str, week_idx: int) -> str:
    """Pick a solve mode from one RNG draw.

    Roughly 70% just_answer, 20% step_by_step, 7% solve_together and 3%
    handwritten. ``persona`` and ``week_idx`` are accepted but not consulted.
    """
    roll = RNG.random()
    # Cumulative upper bounds; the first bound the roll falls under wins.
    cumulative_bounds = (
        (0.70, "just_answer"),
        (0.90, "step_by_step"),
        (0.97, "solve_together"),
    )
    for upper_bound, mode in cumulative_bounds:
        if roll < upper_bound:
            return mode
    return "handwritten"
def answer_for_persona(q: dict, persona: str, force_correct: bool) -> tuple[bool, str]:
    """Return ``(is_correct, answer_text)`` for question *q*.

    When *force_correct* is true the bank's correct answer is returned.
    Otherwise the persona-specific wrong answer from ``q["_wrong_answers"]``
    is used if there is one, else the correct answer with "?" appended.
    """
    if not force_correct:
        persona_slips = q.get("_wrong_answers") or {}
        # Generic wrong answer when the persona has no scripted slip.
        wrong_text = (
            persona_slips[persona]
            if persona in persona_slips
            else q["correct_answer"] + "?"
        )
        return False, wrong_text
    return True, q["correct_answer"]
def gen_answer(student: dict, assignment: dict, aq: dict, q: dict, week_idx: int, total_weeks: int):
    """Simulate one student's answer to one question.

    The student's ``_persona`` selects the probability of a correct answer
    and, for some personas, the solve mode and the base time on task.
    ``assignment``, ``aq`` and ``total_weeks`` are accepted for call-site
    compatibility but are not read.

    The shared RNG is drawn from in a fixed order (default solve mode,
    persona-specific draws, correctness, time jitter); reordering those
    draws changes the generated dataset.

    Returns a dict with keys ``is_correct``, ``answer_text``, ``solve_mode``,
    ``time_on_task_seconds`` and ``misconception_tag``. The tag is ``None``
    when the answer is correct or the persona has no tag for this question.
    """
    persona = student["_persona"]
    difficulty = q["difficulty"]
    base_time = base_time_for_difficulty(difficulty)
    is_word = q.get("tag") == "word_problem"
    is_fraction_op = q["topic"] == "Fractions" and q["sub_topic"] in ("Add", "Subtract", "Multiply")
    # Hard subtraction/multiplication is counted as place-value work too.
    is_place_value = q["topic"] == "Place Value" or (
        q["topic"] == "Arithmetic"
        and q["sub_topic"] in ("Subtraction", "Multiplication")
        and difficulty == "HARD"
    )

    # Defaults: stable_mid baseline, no misconception. Unknown personas keep these.
    p_correct = 0.65
    misconception_candidate = None
    solve_mode = pick_mode_default(persona, week_idx)

    if persona == "fraction_inversion":
        if is_fraction_op:
            # Sharp misconception: very low on fraction ops, declining.
            p_correct = max(0.03, 0.20 - 0.03 * week_idx)
            misconception_candidate = (
                "add_tops_add_bottoms"
                if q["sub_topic"] in ("Add", "Subtract")
                else "fraction_op_confusion"
            )
        elif q["topic"] == "Fractions":
            # Equivalent fractions etc: still shaky.
            p_correct = 0.25
            misconception_candidate = "fraction_general_uncertainty"
        else:
            p_correct = 0.78
    elif persona == "place_value_gaps":
        if is_place_value:
            p_correct = 0.25
            misconception_candidate = "place_value_misalignment"
        else:
            p_correct = 0.65
    elif persona == "rushed_careless":
        # Right method when forced to slow down (step_by_step), wrong when rushed.
        # In just_answer: 40% correct. In step_by_step: 90% correct.
        # Solve mode is mostly just_answer (overrides the default pick).
        r = RNG.random()
        solve_mode = "just_answer" if r < 0.85 else "step_by_step"
        p_correct = 0.90 if solve_mode == "step_by_step" else 0.40
        misconception_candidate = "arithmetic_slip"
        # Time on task decays as the student rushes more:
        # week 0 = 0.9 * base, floor of 0.4 * base.
        t_factor = max(0.4, 0.9 - 0.12 * week_idx)
        base_time = int(base_time * t_factor)
    elif persona == "solve_together_dependent":
        # solve_together usage rises sharply over time. Independent
        # accuracy is low and degrading — student is leaning on scaffolding
        # more and more.
        # Week 0 is ~8%; the probability grows 18 points a week, capped at 92%.
        st_prob = min(0.92, 0.08 + 0.18 * week_idx)
        r = RNG.random()
        if r < st_prob:
            solve_mode = "solve_together"
            p_correct = 0.85
        else:
            solve_mode = "just_answer" if RNG.random() < 0.7 else "step_by_step"
            p_correct = max(0.20, 0.55 - 0.06 * week_idx)
        misconception_candidate = "scaffolding_dependence"
    elif persona == "word_problem_weak":
        if is_word:
            p_correct = 0.20
            misconception_candidate = "word_problem_interpretation"
        else:
            p_correct = 0.78
    elif persona == "stable_strong":
        p_correct = 0.88 if difficulty != "HARD" else 0.78
    elif persona == "stable_mid":
        p_correct = 0.65 if difficulty != "HARD" else 0.50
    elif persona == "stable_weak":
        p_correct = 0.55 if difficulty != "HARD" else 0.40

    # Roll for correctness, then look up the answer text for that outcome.
    is_correct = RNG.random() < p_correct
    is_correct, answer_text = answer_for_persona(q, persona, force_correct=is_correct)
    # The misconception tag is only attached to wrong answers.
    misconception_tag = None if is_correct else misconception_candidate
    return {
        "is_correct": is_correct,
        "answer_text": answer_text,
        "solve_mode": solve_mode,
        "time_on_task_seconds": jitter_time(base_time),
        "misconception_tag": misconception_tag,
    }
def derive_working_steps(q: dict, ans: dict):
    """Compose the first-person "working steps" sentence for an answer.

    The wording depends on the solve mode and on whether the answer was
    correct; wrong step-by-step answers also get an explanation keyed by the
    misconception tag. Any solve mode other than just_answer, solve_together
    or handwritten is worded as step-by-step.
    """
    mode = ans["solve_mode"]
    was_right = bool(ans["is_correct"])
    topic = q["topic"]
    area = q.get("sub_topic") or topic  # fall back to the topic name
    target = q["correct_answer"]
    submitted = ans["answer_text"]
    tag = ans.get("misconception_tag")

    if mode == "just_answer":
        return "I solved it mentally and wrote the final answer only."

    if mode == "solve_together":
        return (
            f"I followed guided steps for {topic.lower()} and reached {submitted}."
            if was_right
            else f"I needed guided help on {area.lower()}, but I still ended with {submitted} instead of {target}."
        )

    if mode == "handwritten":
        return (
            f"I worked it out on paper using {area.lower()} and checked that the final answer was {submitted}."
            if was_right
            else f"I worked it out on paper for {area.lower()}, but my final answer was {submitted} instead of {target}."
        )

    # Everything else is worded as a step-by-step attempt.
    if was_right:
        return f"I used a step-by-step method for {area.lower()} and got the correct answer {submitted}."

    reasons_by_tag = {
        "add_tops_add_bottoms": "I added the numerators and denominators directly instead of finding a common denominator.",
        "fraction_op_confusion": "I mixed up the fraction rule and used the wrong operation method.",
        "fraction_general_uncertainty": "I was unsure which fraction method to use, so my working was inconsistent.",
        "place_value_misalignment": "I lined the digits up incorrectly and misread the place value.",
        "arithmetic_slip": "My method was close, but I made a careless arithmetic slip in the calculation.",
        "scaffolding_dependence": "I could start the method, but I was not secure enough to finish it independently.",
        "word_problem_interpretation": "I misunderstood what the word problem was asking me to calculate.",
    }
    if tag in reasons_by_tag:
        reason = reasons_by_tag[tag]
    else:
        reason = f"My method did not lead to the expected answer {target}."
    return f"I tried a step-by-step method for {area.lower()}, but I got {submitted}. {reason}"
def derive_review_fields(q: dict, ans: dict):
    """Derive the per-answer review fields from a generated answer.

    Understanding and confidence are looked up by solve mode, using separate
    tables for correct and incorrect answers. The correctness score is 1.0
    for a correct answer and 0.0 otherwise, and the question score is the
    mean of correctness and understanding — the same formula
    build_assignment_review_summary() applies per question.

    Returns a dict with the ``review_*`` fields plus ``ai_feedback_review``.
    """
    correct = bool(ans["is_correct"])
    solve_mode = ans["solve_mode"]
    misconception_tag = ans["misconception_tag"]
    expected = q["correct_answer"]

    if correct:
        understanding_score = {
            "step_by_step": 0.95,
            "handwritten": 0.85,
            "just_answer": 0.75,
            "solve_together": 0.65,
        }.get(solve_mode, 0.75)
        confidence = {
            "step_by_step": 0.82,
            "handwritten": 0.78,
            "just_answer": 0.9,
            "solve_together": 0.62,
        }.get(solve_mode, 0.78)
        # A correct answer is still flagged when method evidence is thin.
        needs_attention = understanding_score < 0.72
        issue_reason = (
            "Correct answer, but there is limited evidence that the method is secure."
            if needs_attention else None
        )
        if solve_mode == "step_by_step":
            ai_feedback_review = "Correct answer with clear method evidence."
        elif needs_attention:
            ai_feedback_review = (
                "Correct answer, but understanding evidence is lighter because the response is brief."
            )
        else:
            ai_feedback_review = "Correct answer with secure understanding shown."
    else:
        understanding_score = {
            "step_by_step": 0.4,
            "handwritten": 0.32,
            "just_answer": 0.2,
            "solve_together": 0.28,
        }.get(solve_mode, 0.25)
        confidence = {
            "step_by_step": 0.55,
            "handwritten": 0.6,
            "just_answer": 0.72,
            "solve_together": 0.5,
        }.get(solve_mode, 0.6)
        # Every wrong answer needs attention.
        needs_attention = True
        issue_reason = {
            "add_tops_add_bottoms": "The student added the numerator and denominator directly instead of finding a common denominator.",
            "fraction_op_confusion": "The student confused the fraction operation and did not apply the correct method.",
            "fraction_general_uncertainty": "The student shows insecure understanding of equivalent or comparable fractions.",
            "place_value_misalignment": "The student misread place value, causing digits to be aligned incorrectly.",
            "arithmetic_slip": "The final answer is wrong, suggesting a careless arithmetic slip rather than a secure method.",
            "scaffolding_dependence": "The student appears dependent on scaffolding and does not show secure independent understanding.",
            "word_problem_interpretation": "The student did not translate the word problem into the correct calculation.",
        }.get(
            misconception_tag,
            f"The answer does not match the correct answer ({expected}), showing an incomplete understanding of the method.",
        )
        ai_feedback_review = f"Incorrect answer compared with the expected answer {expected}. {issue_reason}"

    # Previously both scores were hard-coded to 1.0, so wrong answers were
    # reported with a perfect correctness and question score.
    correctness_score = 1.0 if correct else 0.0
    question_score = (correctness_score + understanding_score) / 2
    return {
        "review_correctness_score": correctness_score,
        "review_question_score": round(question_score, 3),
        "review_understanding_score": round(understanding_score, 3),
        "review_confidence": round(confidence, 3),
        "review_needs_attention": needs_attention,
        "review_issue_reason": issue_reason,
        "review_tags": [misconception_tag] if misconception_tag else [],
        "ai_feedback_review": ai_feedback_review,
    }
def build_assignment_review_summary(student_name: str, assignment_name: str, reviews: list[dict]):
    """Summarise one student's reviewed answers for one assignment.

    Each question scores the mean of its correctness (1.0 or 0.0) and its
    understanding score. The overall score is the mean question score on a
    0-10 scale, rounded to two decimals.

    Returns ``(overall_score, ai_feedback, next_step_outcome)``. With no
    reviews the score and outcome are ``None`` and the feedback says that no
    work was submitted. The outcome is "accept" from 6.0, "support" from 4.5,
    otherwise "redo".
    """
    if not reviews:
        return None, "No submitted work is available for this assignment yet.", None

    question_scores = [
        ((1.0 if entry["is_correct"] else 0.0) + entry["review_understanding_score"]) / 2
        for entry in reviews
    ]

    # Group question scores by topic, keeping first-seen topic order.
    scores_by_topic: dict[str, list[float]] = {}
    for entry, score in zip(reviews, question_scores):
        scores_by_topic.setdefault(entry["topic"], []).append(score)

    attention_count = sum(1 for entry in reviews if entry["review_needs_attention"])
    correct_count = sum(1 for entry in reviews if entry["is_correct"])
    overall_score = round(sum(question_scores) / len(question_scores) * 10, 2)

    # Two lowest-scoring topics; ties are broken alphabetically.
    topic_means = [
        (topic, sum(values) / len(values)) for topic, values in scores_by_topic.items()
    ]
    topic_means.sort(key=lambda pair: (pair[1], pair[0]))
    weakest = topic_means[:2]
    if weakest:
        weakest_topic_text = ", ".join(topic for topic, _ in weakest)
    else:
        weakest_topic_text = "general fluency"

    if overall_score >= 6.0:
        next_step_outcome = "accept"
    elif overall_score >= 4.5:
        next_step_outcome = "support"
    else:
        next_step_outcome = "redo"

    ai_feedback = (
        f"{student_name} completed {assignment_name} with {correct_count}/{len(reviews)} correct responses. "
        f"Overall score is {overall_score}/10. "
        f"The weakest areas were {weakest_topic_text}. "
        f"{attention_count} question(s) need extra attention."
    )
    return overall_score, ai_feedback, next_step_outcome
# Order assignments oldest -> newest for week_idx threading.
ASSIGNMENT_DEFS_SORTED = sorted(ASSIGNMENT_DEFS, key=lambda a: a[3])  # by due_offset
weeks_total = len(ASSIGNMENT_DEFS_SORTED)
# Output tables filled by the generation loop below.
ASSIGNMENT_ASSIGNEES = []
STUDENT_ANSWERS = []
ACTIVITY_LOGS = []
# Next primary key to hand out for each output table.
assignee_id_seq = 4000
answer_id_seq = 50000
log_id_seq = 70000
# Define which students "fall behind" on attempts (bonus signal).
# Both sets currently contain only student 203; members get a NOT_STARTED
# assignee row for every assignment that is not CLOSED.
LOW_ATTEMPT_RATE_STUDENTS = {203}  # Chen Wei: persona=rushed_careless
SKIP_RECENT_ASSIGNMENTS_STUDENTS = {203}  # student 203 hasn't attempted recent assignments
def _not_started_assignee(assignee_id: int, aid: int, sid: int, offset: int) -> dict:
    """Build the assignee row for a student who has not opened the assignment."""
    return {
        "id": assignee_id,
        "assignment_id": aid,
        "student_id": sid,
        "classroom_id": CLASSROOM["id"],
        "status": "NOT_STARTED",
        "started_at": None,
        "submitted_at": None,
        "total_marks": None,
        "is_active": True,
        "deactivated_at": None,
        "created_at": ms(days_ago(-(offset - 7))),
    }


# Walk assignments oldest -> newest and, for each student, emit an assignee
# row plus (when the student attempted it) answers and activity logs.
for week_idx, assignment_def in enumerate(ASSIGNMENT_DEFS_SORTED):
    aid, name, topic, offset, status, qb_ids = assignment_def

    # DRAFT assignments have not been handed out: no assignee rows for anyone.
    # Checked first so flagged low-attempt students do not get a stray row.
    if status == "DRAFT":
        continue

    in_flight = status in ("IN_PROGRESS", "PUBLISHED")
    a_questions = [aq for aq in ASSIGNMENT_QUESTIONS if aq["assignment_id"] == aid]

    for student in STUDENTS:
        sid = student["id"]

        # Flagged students never open an assignment that is not CLOSED.
        # No RNG draw happens on this path.
        flagged = sid in SKIP_RECENT_ASSIGNMENTS_STUDENTS or sid in LOW_ATTEMPT_RATE_STUDENTS
        if status != "CLOSED" and flagged:
            ASSIGNMENT_ASSIGNEES.append(_not_started_assignee(assignee_id_seq, aid, sid, offset))
            assignee_id_seq += 1
            continue

        # In-flight: ~85% of students have started; the rest get NOT_STARTED.
        if in_flight and RNG.random() >= 0.85:
            ASSIGNMENT_ASSIGNEES.append(_not_started_assignee(assignee_id_seq, aid, sid, offset))
            assignee_id_seq += 1
            continue

        # Closed assignments are fully submitted; in-flight ones are partial.
        submitted = not in_flight

        # Date the attempts two days after the assignment was created
        # (created at offset - 7), no earlier than 42 days ago.
        # NOTE(review): for an assignment due 5+ days out this lands on or
        # after TODAY's date, i.e. later than the 09:00 reference instant —
        # confirm that is intended.
        attempt_day_offset = max(offset - 5, -42)
        started_at = days_ago(-attempt_day_offset, hour=16 + RNG.randint(0, 3), minute=RNG.randint(0, 59))

        assignee_id = assignee_id_seq
        assignee_id_seq += 1

        # Generate answers
        total_score = 0
        questions_to_answer = a_questions
        if in_flight:
            # Partial completion: at least half the questions, in order.
            n_done = RNG.randint(max(1, len(a_questions) // 2), len(a_questions))
            questions_to_answer = a_questions[:n_done]

        running_time_offset = 0  # seconds elapsed since started_at
        answer_reviews = []
        for aq in questions_to_answer:
            q = QB_BY_ID[aq["question_bank_id"]]
            ans = gen_answer(student, assignment_def, aq, q, week_idx, weeks_total)
            working_steps = derive_working_steps(q, ans)
            review = derive_review_fields(q, ans)
            answered_at = started_at + timedelta(seconds=running_time_offset + ans["time_on_task_seconds"])
            # Short random pause before the next question.
            running_time_offset += ans["time_on_task_seconds"] + RNG.randint(5, 30)
            STUDENT_ANSWERS.append({
                "id": answer_id_seq,
                "assignee_id": assignee_id,
                "assignment_question_id": aq["id"],
                "answer_type": "LATEX",
                "answer_latex": ans["answer_text"],
                "extracted_answer": ans["answer_text"],
                "solve_mode": ans["solve_mode"],
                "working_steps": working_steps,
                "graded_marks": 1 if ans["is_correct"] else 0,
                "marks_awarded": 1 if ans["is_correct"] else 0,
                "ai_reasoning": (
                    "Answer matches expected solution." if ans["is_correct"]
                    else f"Incorrect; expected {q['correct_answer']}."
                ),
                "is_correct": ans["is_correct"],
                "ai_feedback": review["ai_feedback_review"],
                "review_needs_attention": review["review_needs_attention"],
                "review_issue_reason": review["review_issue_reason"],
                "review_correctness_score": review["review_correctness_score"],
                "review_understanding_score": review["review_understanding_score"],
                "review_question_score": review["review_question_score"],
                "review_confidence": review["review_confidence"],
                "review_tags": review["review_tags"],
                "grading_status": "GRADED",
                "grading_attempts": 1,
                "is_active": True,
                "created_at": ms(answered_at),
                # ---- Hackathon annotations (not in production schema) ----
                "_solve_mode": ans["solve_mode"],
                "_time_on_task_seconds": ans["time_on_task_seconds"],
                "_is_correct": ans["is_correct"],
                "_misconception_tag": ans["misconception_tag"],
                "_question_topic": q["topic"],
                "_question_sub_topic": q["sub_topic"],
                "_question_difficulty": q["difficulty"],
                "_answered_at": ms(answered_at),
            })
            answer_reviews.append({
                "topic": q["topic"],
                "is_correct": ans["is_correct"],
                "review_understanding_score": review["review_understanding_score"],
                "review_needs_attention": review["review_needs_attention"],
            })
            answer_id_seq += 1
            total_score += (1 if ans["is_correct"] else 0)
            # Activity log
            ACTIVITY_LOGS.append({
                "id": log_id_seq,
                "assignee_id": assignee_id,
                "assignment_question_id": aq["id"],
                "activity_type": "ANSWERED",
                "timestamp": ms(answered_at),
                "duration_seconds": ans["time_on_task_seconds"],
                "extra_data": {"solve_mode": ans["solve_mode"]},
                "created_at": ms(answered_at),
                "_student_id": sid,
            })
            log_id_seq += 1

        submitted_at = started_at + timedelta(seconds=running_time_offset) if submitted else None
        overall_score, assignee_ai_feedback, next_step_outcome = build_assignment_review_summary(
            student["fullname"],
            name,
            answer_reviews,
        )
        # Summary fields are only published once the work is submitted.
        ASSIGNMENT_ASSIGNEES.append({
            "id": assignee_id,
            "assignment_id": aid,
            "student_id": sid,
            "classroom_id": CLASSROOM["id"],
            "status": "SUBMITTED" if submitted else "IN_PROGRESS",
            "started_at": ms(started_at),
            "submitted_at": ms(submitted_at) if submitted_at else None,
            "total_marks": total_score if submitted else None,
            "overall_score": overall_score if submitted else None,
            "ai_feedback": assignee_ai_feedback if submitted else None,
            "next_step_outcome": next_step_outcome if submitted else None,
            "is_active": True,
            "deactivated_at": None,
            "created_at": ms(days_ago(-(offset - 7))),
        })
# ---------------------------------------------------------------------------
# Write outputs
# ---------------------------------------------------------------------------
def write_json(name: str, data) -> None:
    """Serialise *data* as indented JSON to ``OUT_DIR / name`` and report it.

    The file is opened as UTF-8 explicitly so the result does not depend on
    the platform's default encoding. Values json cannot encode natively are
    stringified via ``default=str``.
    """
    path = OUT_DIR / name
    with path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, default=str)
    print(f" wrote {name} ({len(data) if isinstance(data, list) else 'object'} records)")
print("Generating mock dataset...")

# Classroom, tutor and membership rows travel together in both outputs.
classroom_bundle = {
    "classroom": CLASSROOM,
    "tutor": TUTOR,
    "classroom_student_rs": CLASSROOM_STUDENT_RS,
}

# One file per table, written in this order.
per_table_outputs = [
    ("classroom.json", classroom_bundle),
    ("students.json", STUDENTS),
    ("question_bank.json", QUESTION_BANK),
    ("assignments.json", ASSIGNMENTS),
    ("assignment_questions.json", ASSIGNMENT_QUESTIONS),
    ("assignment_assignees.json", ASSIGNMENT_ASSIGNEES),
    ("student_answers.json", STUDENT_ANSWERS),
    ("activity_logs.json", ACTIVITY_LOGS),
]
for output_name, payload in per_table_outputs:
    write_json(output_name, payload)

# Combined file: metadata followed by every table.
dataset_meta = {
    # Uses the fixed reference date, not the wall clock, so re-runs match.
    "generated_at_utc": TODAY.isoformat(),
    "reference_today": TODAY.date().isoformat(),
    "schema_source": "elevenplus-backend/src/app/models/",
    "subject": "Maths (UK 11+)",
    "students": len(STUDENTS),
    "assignments": len(ASSIGNMENTS),
    "questions_in_bank": len(QUESTION_BANK),
    "student_answers": len(STUDENT_ANSWERS),
    "expected_top_3_at_risk_student_ids": [201, 203, 204],
}
dataset = {
    "_meta": dataset_meta,
    **classroom_bundle,
    "students": STUDENTS,
    "question_bank": QUESTION_BANK,
    "assignments": ASSIGNMENTS,
    "assignment_questions": ASSIGNMENT_QUESTIONS,
    "assignment_assignees": ASSIGNMENT_ASSIGNEES,
    "student_answers": STUDENT_ANSWERS,
    "activity_logs": ACTIVITY_LOGS,
}
write_json("dataset.json", dataset)
print("Done.")