Upload and test

MangoPig 2025-12-09 17:25:14 +00:00
parent f9a669ae69
commit dba1acccee
53 changed files with 1756 additions and 107 deletions

00-Lesson-Site/.env Normal file
View File

@@ -0,0 +1,2 @@
SERVER_FRONTEND_PORT=10020
SERVER_FRONTEND_DEV_PORT=10021

View File

@@ -0,0 +1,26 @@
services:
leafpig-lesson-site-frontend-dev:
container_name: leafpig-lesson-site-frontend-dev
build:
context: ./frontend
target: development
ports:
- ${SERVER_FRONTEND_DEV_PORT}:4321
volumes:
- ./frontend:/app
- ./frontend/node_modules:/app/node_modules
env_file:
- .env
profiles:
- dev
leafpig-lesson-site-frontend-prod:
container_name: leafpig-lesson-site-frontend-prod
build:
context: ./frontend
target: production
ports:
- ${SERVER_FRONTEND_PORT}:3000
env_file:
- .env
profiles:
- prod
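
For reference, with the two profiles above the stack can be started via Docker Compose profiles; a minimal sketch, assuming Docker Compose v2 and the `.env` file shown earlier, run from the directory containing this compose file:

```zsh
# Hot-reloading dev server on ${SERVER_FRONTEND_DEV_PORT}
docker compose --profile dev up --build

# Production build served on ${SERVER_FRONTEND_PORT}
docker compose --profile prod up -d --build
```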

View File

@@ -0,0 +1,43 @@
# Base Image
FROM node:22-alpine AS base
WORKDIR /app
# Enable pnpm
RUN corepack enable && corepack prepare pnpm@latest --activate
# Copy manifest files first to cache dependencies
COPY pnpm-lock.yaml package.json ./
# Install dependencies
RUN pnpm install
# --- Development Stage ---
FROM base AS development
COPY . .
# Astro default port
EXPOSE 4321
# --host is required to expose the server to the container
CMD ["pnpm", "dev", "--host"]
# --- Build Stage ---
FROM base AS build
COPY . .
RUN pnpm build
# --- Production Stage ---
FROM base AS production
WORKDIR /app
# We install 'serve' globally here so we don't rely on node_modules
# This keeps the final image smaller/cleaner
RUN npm install -g serve
# Copy the built output from the build stage
# Astro outputs to 'dist' by default
COPY --from=build /app/dist ./dist
COPY serve.json ./dist
# Expose the port you want for production (e.g., 3000 or 80)
EXPOSE 3000
CMD ["serve", "dist", "-l", "3000", "--single", "--config", "serve.json"]

View File

@@ -1,9 +1,8 @@
// @ts-check
import mdx from "@astrojs/mdx";
import solidJs from "@astrojs/solid-js";
-import { defineConfig } from "astro/config";
import expressiveCode from "astro-expressive-code";
+import { defineConfig } from "astro/config";
// https://astro.build/config
export default defineConfig({

View File

@@ -13,6 +13,7 @@
"@astrojs/solid-js": "^5.1.3",
"astro": "^5.16.4",
"astro-expressive-code": "^0.41.3",
+"serve": "^14.2.5",
"solid-js": "^1.9.10"
},
"devDependencies": {

File diff suppressed because it is too large

View File

@@ -0,0 +1,31 @@
{
"headers": [
{
"source": "/_astro/**",
"headers": [
{
"key": "Cache-Control",
"value": "public, max-age=31536000, immutable"
}
]
},
{
"source": "/fonts/**",
"headers": [
{
"key": "Cache-Control",
"value": "public, max-age=31536000, immutable"
}
]
},
{
"source": "**",
"headers": [
{
"key": "Cache-Control",
"value": "public, max-age=0, must-revalidate"
}
]
}
]
}
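
Once the production container is running, the cache headers above can be spot-checked; a hedged example, assuming the container's port is reachable at localhost:3000 (adjust the port to however you mapped it):

```zsh
# The HTML document should come back with "max-age=0, must-revalidate"
curl -sI http://localhost:3000/ | grep -i cache-control
```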

View File

@@ -17,8 +17,6 @@ import styles from "./DarkModeToggle.module.scss";
</button>
<script>
-// 1. Define the logic OUTSIDE the event listener.
-// This creates a stable reference that doesn't change on navigation.
const handleToggleClick = () => {
const root = document.documentElement;
const isDark = root.classList.contains("dark");
@@ -39,9 +37,6 @@ import styles from "./DarkModeToggle.module.scss";
const toggleBtn = document.getElementById("theme-toggle");
if (toggleBtn) {
-// 2. Now this actually works!
-// Since 'handleToggleClick' is the exact same function from step 1,
-// the browser successfully removes the old one before adding the new one.
toggleBtn.removeEventListener("click", handleToggleClick);
toggleBtn.addEventListener("click", handleToggleClick);
}

View File

@@ -0,0 +1,21 @@
---
// Path: src/components/Post/Blockquotes/Ganbatte.astro
import styles from "./Ganbatte.module.scss";
interface Props {
toc?: string;
tocLevel?: string;
imageAlt?: string;
}
const { toc, tocLevel = "1", imageAlt = "MangoPig Ganbatte" } = Astro.props;
---
<blockquote class={styles.ganbatte} data-toc={toc} data-toc-level={tocLevel}>
<slot />
<picture>
<img src="https://pic.mangopig.tech/i/4c4d1b5f-b9ce-4952-a1b4-991b19c0adb5.png" alt={imageAlt} />
</picture>
</blockquote>

View File

@@ -0,0 +1,41 @@
/* Path: src/components/Post/Blockquotes/Ganbatte.module.scss */
.ganbatte {
background-color: #fff45e1a;
padding: 30px;
border-radius: 10px;
position: relative;
min-height: 100px;
picture {
position: absolute;
bottom: -10px;
right: -10px;
margin: 0;
width: 200px;
max-width: 30%;
transform: rotate(10deg);
img {
width: 100%;
height: auto;
box-shadow: none;
}
}
ul {
list-style-type: disc;
padding-left: 20px;
margin-right: 220px;
}
span {
position: absolute;
top: 50%;
left: 30px;
transform: translateY(-50%);
}
}

View File

@@ -0,0 +1,12 @@
---
// Path: src/components/Post/Blockquotes/Important.astro
import styles from "./Important.module.scss";
---
<blockquote class={styles.important}>
<slot />
<picture class={styles.sticker}>
<img src="https://pic.mangopig.tech/i/7eb5b343-5ddf-47ae-a272-4b82ca3d53d7.webp" alt="MangoPig Important" />
</picture>
</blockquote>

View File

@@ -0,0 +1,55 @@
/* Path: src/components/Post/Blockquotes/Important.module.scss */
.important {
background-color: #ff5e5e33;
padding: 30px;
border-radius: 10px;
position: relative;
min-height: 100px;
font-weight: 500;
.sticker {
position: absolute;
bottom: 0px;
right: -10px;
margin: 0;
width: 100px;
max-width: 30%;
transform: rotate(10deg);
img {
width: 100%;
height: auto;
box-shadow: none;
}
}
ul {
list-style-type: disc;
padding-left: 20px;
margin-right: 100px;
}
ol {
list-style-type: decimal;
margin-top: 20px;
padding-left: 20px;
margin-right: 100px;
}
p {
margin-right: 100px;
}
span {
// Place in middle vertically
position: absolute;
top: 50%;
left: 30px;
transform: translateY(-50%);
}
}

View File

@@ -0,0 +1,12 @@
---
// Path: src/components/Post/Blockquotes/Info.astro
import styles from "./Info.module.scss";
---
<blockquote class={styles.info}>
<slot />
<picture class={styles.sticker}>
<img src="https://pic.mangopig.tech/i/ebf2e26a-8791-4277-90cb-079ad7454aef.webp" alt="MangoPig Ganbattte" />
</picture>
</blockquote>

View File

@@ -0,0 +1,53 @@
/* Path: src/components/Post/Blockquotes/Info.module.scss */
.info {
background-color: #5efaff1a;
padding: 30px;
border-radius: 10px;
position: relative;
min-height: 100px;
.sticker {
position: absolute;
bottom: -10px;
right: -10px;
margin: 0;
width: 100px;
max-width: 30%;
transform: rotate(10deg);
img {
width: 100%;
height: auto;
box-shadow: none;
}
}
ul {
list-style-type: disc;
padding-left: 20px;
margin-right: 100px;
}
ol {
list-style-type: decimal;
margin-top: 20px;
padding-left: 20px;
margin-right: 100px;
}
p {
margin-right: 100px;
}
span {
// Place in middle vertically
position: absolute;
top: 50%;
left: 30px;
transform: translateY(-50%);
}
}

View File

@@ -0,0 +1,27 @@
---
// Path: src/components/Post/Blockquotes/QA.astro
import styles from "./QA.module.scss";
---
<div class={styles.qaContainer}>
{/* The Question Section */}
<div class={styles.questionHeader}>
<span class={styles.prefix}>Q:</span>
<span class={styles.questionText}>
<slot name="question" />
</span>
</div>
{/* The Answer Section (Default Slot) */}
<div class={styles.answerBody}>
<span class={styles.prefix}>A:</span>
<div class={styles.answerContent}>
<slot />
</div>
</div>
{/* The Sticker */}
<picture class={styles.sticker}>
<img src="https://pic.mangopig.tech/i/4c4d1b5f-b9ce-4952-a1b4-991b19c0adb5.png" alt="Thinking MangoPig" />
</picture>
</div>

View File

@@ -0,0 +1,78 @@
/* Path: src/components/Post/Blockquotes/QA.module.scss */
.qaContainer {
// Use a yellowish tint to differentiate from Info block
background-color: #fff45e26;
padding: 30px;
border-radius: 10px;
position: relative;
min-height: 120px;
margin-bottom: 20px;
// --- Sticker Logic (Same as Info block) ---
.sticker {
position: absolute;
bottom: -10px;
right: -10px;
margin: 0;
width: 100px;
max-width: 30%;
// Rotate opposite way for variety
transform: rotate(-10deg);
pointer-events: none;
img {
width: 100%;
height: auto;
box-shadow: none;
}
}
//Common prefix style (Q: and A:)
.prefix {
font-weight: 800;
color: #ffbd72; // Matches your H2 color scheme
margin-right: 12px;
display: inline-block;
min-width: 25px;
}
// --- Question Section ---
.questionHeader {
display: flex;
align-items: baseline;
margin-bottom: 20px;
font-size: 1.1em;
font-weight: 700;
// Ensure text doesn't hit the sticker
margin-right: 90px;
color: color-adjust(primary, 0, 0);
}
// --- Answer Section ---
.answerBody {
display: flex;
align-items: baseline;
// Ensure text doesn't hit the sticker
margin-right: 90px;
}
.answerContent {
flex: 1;
line-height: 1.6;
// Handle standard markdown elements inside the answer slot
p {
margin-bottom: 1em;
&:last-child {
margin-bottom: 0;
}
}
ul,
ol {
margin-bottom: 1em;
padding-left: 20px;
}
}
}

View File

@@ -18,24 +18,41 @@ import styles from "./FloatingTOC.module.scss";
const targets = content.querySelectorAll("[data-toc]");
targets.forEach((el, index) => {
-if (!el.id) el.id = `toc-item-${index}`;
-const label = el.getAttribute("data-toc");
+const label = el.getAttribute("data-toc") || "";
const level = el.getAttribute("data-toc-level") || "1";
+// --- CHANGED SECTION START: ID Generation (Slugify) ---
+if (!el.id) {
+// Convert "Setting Up Developer Environment" -> "setting-up-developer-environment"
+const slug = label
+.toLowerCase()
+.trim()
+.replace(/[^\w\s-]/g, "") // Remove non-word chars
+.replace(/[\s_-]+/g, "-") // Replace spaces with dashes
+.replace(/^-+|-+$/g, ""); // Trim dashes from start/end
+// Safety check: If ID exists or slug is empty, fallback to index
+if (!slug || document.getElementById(slug)) {
+el.id = `section-${index}`;
+} else {
+el.id = slug;
+}
+}
+// --- CHANGED SECTION END ---
const li = document.createElement("li");
const a = document.createElement("a");
-// Add data attribute for CSS to target specific lengths
li.setAttribute("data-level", level);
a.href = `#${el.id}`;
-// No Icon span, just the text which we will hide via CSS
a.innerHTML = `<span class="toc-text">${label}</span>`;
a.addEventListener("click", (e) => {
e.preventDefault();
el.scrollIntoView({ behavior: "smooth", block: "center" });
+// Optional: Update URL hash without jumping
+history.pushState(null, "", `#${el.id}`);
});
li.appendChild(a);
@@ -45,14 +62,11 @@ import styles from "./FloatingTOC.module.scss";
(entries) => {
entries.forEach((entry) => {
if (entry.isIntersecting) {
-// Clear all active classes
document.querySelectorAll("#toc-list a").forEach((link) => link.classList.remove("active"));
-// Set current active
a.classList.add("active");
}
});
},
-// Tweak: Use a 1px line exactly in the vertical center of the screen
{ rootMargin: "-50% 0px -50% 0px", threshold: 0 }
);

View File

@@ -4,6 +4,20 @@
border-left: 4px solid color-adjust(secondary, 0, 0);
padding: 1rem;
background: rgba(128, 128, 128, 0.1);
+button {
+background: none;
+border: none;
+color: color-adjust(primary, 0, 0);
+cursor: pointer;
+font-weight: bold;
+padding: 0;
+text-decoration: underline;
+&:hover {
+opacity: 0.8;
+}
+}
}
.spoilerContent {

View File

@@ -0,0 +1,732 @@
---
# Path: src/content/lessons/01-intro.mdx
title: "Introduction to Web Dev"
description: "Setting up the environment"
style: "type-1"
---
{/* Blockquotes */}
import Ganbatte from "../../components/Post/Blockquotes/Ganbatte.astro";
import Important from "../../components/Post/Blockquotes/Important.astro";
import Info from "../../components/Post/Blockquotes/Info.astro";
import QA from "../../components/Post/Blockquotes/QA.astro";
import Spoiler from "../../components/Post/Spoiler.tsx";
# Hosting a Large Language Model (LLM) Locally
<picture>
<img src="https://pic.mangopig.tech/i/879aaccd-6822-423f-883a-74cf5ba598e7.jpg" alt="Web Development Illustration" />
</picture>
<blockquote class="lesson-meta">
<span>Lesson 01</span>
<span>Created at: **December 2025**</span>
<span>Last Updated: **December 2025**</span>
</blockquote>
<Ganbatte toc="Lesson Objectives" tocLevel="1" imageAlt="MangoPig Ganbatte">
## Lesson Objectives
- Setting up your Developer Environment
- Setting up an isolated Docker environment for hosting LLMs
- Fetching the AI model
- Converting the model to GGUF format
- Quantizing the model for better performance
- Hosting a basic LLM model with llama.cpp locally
- (To Be Added) Making a volume mount to persist LLM data across container restarts
- (To Be Added) Tagging the Docker Image for future reuse
</Ganbatte>
<section data-toc="Setting Up Developer Environment" data-toc-level="1">
<h2>Setting Up Your Developer Environment</h2>
<section data-toc="WSL" data-toc-level="2">
<h3>Setting Up WSL (Windows Subsystem for Linux)</h3>
To set up WSL on your Windows machine, follow these steps:
1. Open PowerShell as Administrator.
2. Run the following command to enable WSL and install a Linux distribution (Ubuntu is recommended):
```zsh frame="none"
wsl --install
```
3. Restart your computer when prompted.
4. After restarting, open the Ubuntu application from the Start menu and complete the initial setup by creating a user account.
5. Update your package lists and upgrade installed packages by running:
```zsh frame="none"
sudo apt update && sudo apt upgrade -y
```
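If you want to confirm that your distribution was installed and is running under WSL 2, you can check from PowerShell (a quick optional sanity check):
```zsh frame="none"
wsl -l -v
```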
</section>
<section data-toc="ZSH" data-toc-level="2">
<h3>Getting Your Environment Ready</h3>
```zsh frame="none"
sudo apt install -y git make curl sudo zsh
```
```zsh frame="none"
mkdir -p ~/Config/Dotfiles
git clone https://git.mangopig.tech/MangoPig/Dot-Zsh.git ~/Config/Dotfiles/Zsh
cd ~/Config/Dotfiles/Zsh
```
Whenever a prompt asks whether to install something, just confirm with `y` and hit Enter.
```zsh frame="none"
make setup
```
Restart the shell to finalize the zsh setup:
```zsh frame="none"
zsh
```
With the above commands, you should have zsh, programming-language toolchains, and Docker set up. We will go into more detail on each of these tools as we work through the lessons.
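If you want to double-check that the setup worked, a couple of quick version checks should succeed (exact versions will differ):
```zsh frame="none"
zsh --version
git --version
```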
</section>
<section data-toc="Docker" data-toc-level="2">
<h3>Installing Docker</h3>
Docker should already be installed with the above steps. To verify, run:
```zsh frame="none"
docker --version
```
and try to run a test container:
```zsh frame="none"
docker run hello-world
```
If you run into permission issues, you may need to add your user to the `docker` group:
```zsh frame="none"
sudo usermod -aG docker $USER
```
Then restart the shell, or log out and back in, so the group change takes effect:
```zsh frame="none"
zsh
```
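If you would rather not restart the shell, `newgrp` can pick up the new group membership in place (an optional alternative):
```zsh frame="none"
newgrp docker
docker run hello-world
```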
</section>
</section>
<section data-toc="Docker Environment Setup" data-toc-level="1">
<h2>Setting Up the Isolated Docker Environment for Hosting LLMs</h2>
Now that we have the local environment ready, we want to set up an isolated Docker environment for hosting LLMs so that it doesn't interfere with our main system.
<section data-toc="What is Docker?" data-toc-level="2">
<h3>What is Docker?</h3>
Docker is a platform that allows you to package your application and its dependencies into containers.
<Info>
<span>You can find more Docker Images on <a href="https://hub.docker.com/">Docker Hub</a>.</span>
</Info>
<section data-doc="Installing Docker" data-doc-level="3">
<h4>Installing Docker</h4>
</section>
</section>
<section data-toc="Creating Docker Container" data-toc-level="2">
<h3>Creating the Docker Container</h3>
For our current purpose, we will be using the official <a href="https://hub.docker.com/r/nvidia/cuda/tags">NVIDIA Docker image</a> so that we can leverage CUDA for GPU acceleration if available.
We will create the Docker container and make it interactive by running:
```zsh frame="none"
docker run --gpus all -it --name llm-container -p 8080:8080 nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04 /bin/bash
```
<Info>
- `--gpus all` enables GPU support for the container.
- `-it` makes the container interactive, allowing you to run commands inside it.
- `--name llm-container` gives the container a name for easier reference.
- `-p 8080:8080` = `-p HOST:CONTAINER` maps port 8080 on your host machine to port 8080 inside the container. This is useful if you plan to run a server inside the container and want to access it from your host machine.
- `nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04` specifies the Docker image to use.
- `/bin/bash` starts a bash shell inside the container.
</Info>
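If you have an NVIDIA GPU and the NVIDIA Container Toolkit set up on the host, you can quickly confirm that the GPU is visible inside the container (optional; skip this if you are running CPU-only):
```zsh frame="none"
nvidia-smi
```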
Once you are inside the container, you can proceed to set up the environment like we did before in the <a href="#setting-up-developer-environment">WSL section</a>.
<Info>
There are a few things you need to do before you can set up the environment like we did before:
1. Update the package lists and install necessary packages:
```zsh frame="none"
apt update && apt install -y git make curl sudo zsh
```
2. Remove the default user (usually `ubuntu`) to avoid permission issues:
```zsh frame="none"
userdel -r ubuntu
```
3. Run my provisioning script to set up users and permissions:
```zsh frame="none"
bash <(curl -s https://git.mangopig.tech/mangopig/Dot-Zsh/raw/branch/main/scripts/provision.sh)
```
Create your own user when prompted, give it 1000 as both its UID and GID for consistency, and remember the password you set here; you'll need it to use `sudo` later on.
4. Now switch users by running: **(replace `your-username` with the username you created)**
```zsh frame="none"
su - your-username
```
OR you can exit the container and reattach with the new user by doing:
```zsh frame="none"
exit
docker start llm-container
docker exec -it --user your-username llm-container /bin/zsh
```
Press `q` if you are prompted to create a zsh configuration file.
5. Now you can proceed to set up zsh and the rest of the environment as shown in the [previous section](#zsh).
</Info>
Try to do this on your own first! If you get stuck, you can check the solution below.
<Spoiler client:idle >
## Solution
1. Update the package lists and install necessary packages:
```zsh frame="none"
apt update && apt install -y git make curl sudo zsh
```
2. Remove the default user (usually `ubuntu`) to avoid permission issues:
```zsh frame="none"
userdel -r ubuntu
```
3. Run my provisioning script to set up users and permissions:
```zsh frame="none"
bash <(curl -s https://git.mangopig.tech/mangopig/Dot-Zsh/raw/branch/main/scripts/provision.sh)
```
Create your own user when prompted, give it 1000 as both its UID and GID for consistency, and remember the password you set here; you'll need it to use `sudo` later on.
4. Now switch users by running: **(replace `your-username` with the username you created)**
```zsh frame="none"
su - your-username
```
OR you can exit the container and reattach with the new user by doing:
```zsh frame="none"
exit
docker start llm-container
docker exec -it --user your-username llm-container /bin/zsh
```
Press `q` if you are prompted to create a zsh configuration file.
5. Go into the dotfiles directory and set up zsh:
```zsh frame="none"
cd ~/Config/Dot-Zsh
make base && \
make python && \
make clean && \
make stow
```
6. Restart the shell to finalize the zsh setup:
```zsh frame="none"
zsh
```
7. Verify that pyenv and Miniforge are working by running:
```zsh frame="none"
pyenv --version
conda --version
```
</Spoiler>
</section>
</section>
<section data-toc="Python Setup" data-toc-level="1">
<h2>Setting Up Python Environment</h2>
Now that we have the Docker container set up, we can proceed to set up the environment to run llama.cpp inside the container.
We have already set up `pyenv` and `Miniforge` as part of the zsh setup. You can verify that they are working by running:
```zsh frame="none"
pyenv --version
conda --version
```
`pyenv` allows us to manage multiple Python versions, making it easy to install different Python versions and Conda distributions as needed for different projects.
`conda` (via Miniforge) allows us to create isolated Python environments, which is helpful for making sure that the dependencies for llama.cpp do not interfere with other projects.
Let's first create a directory for llama.cpp and navigate into it:
```zsh frame="none"
mkdir -p ~/Projects/llama.cpp
cd ~/Projects/llama.cpp
```
Now, let's clone the llama.cpp repository:
```zsh frame="none"
git clone https://github.com/ggerganov/llama.cpp.git .
```
<Info>
- You can also list the contents of the repository with `ls -la`
- The `.` at the end of the git clone command ensures that the contents of the repository are cloned directly into the current directory.
- For convenience, you can find the official llama.cpp repository at <a href="https://github.com/ggml-org/llama.cpp?tab=readme-ov-file">llama.cpp GitHub</a>
</Info>
With the repository cloned, we can now proceed to build llama.cpp.
We first use `cmake` to configure the build system. This is like telling the build what our computer environment looks like and which options we want to enable.
```zsh frame="none"
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON
```
<Info>
- `-S .` tells cmake where to find the source files (in this case, the current directory).
- `-B build` specifies where all the temporary build files will go (in a folder named `build`).
- `-G Ninja` tells cmake to use the Ninja build system.
- `-DCMAKE_BUILD_TYPE=Release` sets the build type to Release for optimized performance.
- `-DCMAKE_INSTALL_PREFIX=/usr/local` specifies where to install the built files. You can change this to your desired installation path.
- `-DLLAMA_BUILD_TESTS=OFF` disables building tests.
- `-DLLAMA_BUILD_EXAMPLES=ON` enables building example programs.
- `-DLLAMA_BUILD_SERVER=ON` enables building the server component.
</Info>
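If this configure step complains that `cmake`, Ninja, or a compiler are missing, they can be installed with apt first (your dotfiles setup may already provide them, so treat this as a fallback). llama.cpp also has a CUDA backend that can be enabled by adding `-DGGML_CUDA=ON` to the configure command, assuming the CUDA toolkit from the base image is available; check the llama.cpp build docs for your version, since these flags have changed over time.
```zsh frame="none"
sudo apt install -y cmake ninja-build build-essential
```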
Now we can build the project. This step takes what we told cmake to do and actually turns it into executable files.
```zsh frame="none"
cmake --build build --config Release -j $(nproc)
```
<Info>
- `--build build` tells cmake to build the project using the files in the `build` directory (which we set with `-B` in the previous step).
- `--config Release` specifies that we want to build the Release version.
- `-j $(nproc)` tells cmake to use all available CPU cores for faster building.
- `$(nproc)` is a command that returns the number of processing units available.
</Info>
After the build is done, the binaries will be located in the `build/bin` directory. We want to install them to a more accessible location (the `/usr/local` prefix we specified earlier) so we can run them easily. We can do this by running:
```zsh frame="none"
sudo cmake --install build && \
sudo ldconfig
```
<Info>
- `--install build` tells cmake to install the built files from the `build` directory to the location we specified earlier with `-DCMAKE_INSTALL_PREFIX`.
- `sudo ldconfig` updates the system's library cache to recognize the newly installed binaries.
</Info>
Now you should be able to run the `llama.cpp` binaries from anywhere. You can check which llama.cpp tools are available by running:
```zsh frame="none"
ls /usr/local/bin
```
```zsh frame="none"
󰡯 bat 󰡯 llama-eval-callback 󰡯 llama-lookup 󰡯 llama-save-load-state
󰡯 convert_hf_to_gguf.py 󰡯 llama-export-lora 󰡯 llama-lookup-create 󰡯 llama-server
󰡯 fd 󰡯 llama-finetune 󰡯 llama-lookup-merge 󰡯 llama-simple
󰡯 llama-batched 󰡯 llama-gen-docs 󰡯 llama-lookup-stats 󰡯 llama-simple-chat
󰡯 llama-batched-bench 󰡯 llama-gguf 󰡯 llama-mtmd-cli 󰡯 llama-speculative
󰡯 llama-bench 󰡯 llama-gguf-hash 󰡯 llama-parallel 󰡯 llama-speculative-simple
󰡯 llama-cli 󰡯 llama-gguf-split 󰡯 llama-passkey 󰡯 llama-tokenize
󰡯 llama-convert-llama2c-to-ggml 󰡯 llama-idle 󰡯 llama-perplexity 󰡯 llama-tts
󰡯 llama-cvector-generator 󰡯 llama-imatrix 󰡯 llama-quantize
󰡯 llama-diffusion-cli 󰡯 llama-logits 󰡯 llama-retrieval
󰡯 llama-embedding 󰡯 llama-lookahead 󰡯 llama-run
```
We can further verify whether we can run `llama.cpp` by checking its version:
```zsh frame="none"
llama-cli --version
```
```zsh frame="none"
version: 7327 (c8554b66e)
built with GNU 13.3.0 for Linux x86_64
```
</section>
<section data-toc="Getting the AI" data-toc-level="1">
<h2>Fetching the AI Model Weights</h2>
Now that we have llama.cpp set up, we need to get some AI models to run with it.
The main place to get models is from [Hugging Face](https://huggingface.co/). You will need to create an account if you don't have one already.
Once you have created an account, you should also set up an access token from your account settings:
<picture>
<img src="https://pic.mangopig.tech/i/aea54c8e-9dd5-44c7-ab1f-6b57b076e7d8.webp" alt="Hugging Face Access Token" />
</picture>
And then give your token all the `read` permissions.
<picture>
<img src="https://pic.mangopig.tech/i/4360ee94-7f37-4897-91e9-882fd198b8b3.webp" alt="Hugging Face Token Permissions" />
</picture>
<Important>
Make sure to copy the token somewhere safe. **DO NOT SHARE IT WITH ANYONE**, **PUT IT IN PUBLIC REPOSITORIES**, or **HARD-CODE IT IN YOUR CODE**! Consult AIs on how to keep your tokens safe if you are unsure, but do not share the token itself with the AI.
</Important>
Now that you have your token, you can use it to download models from Hugging Face. We will use the Hugging Face CLI (`hf`) to do this. Let's first make a directory to store the models:
```zsh frame="none"
mkdir -p ~/Models
cd ~/Models
```
We can then install the Hugging Face CLI:
```zsh frame="none"
curl -LsSf https://hf.co/cli/install.sh | bash
```
We will first tell git to store credentials (so the token can be saved as a git credential), then log in to Hugging Face using the CLI and provide our access token when prompted:
```zsh frame="none"
git config --global credential.helper store
```
```zsh frame="none"
hf auth login
```
```zsh frame="none"
_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): INPUT_YOUR_TOKEN_HERE
Add token as git credential? [y/N]: y
Token is valid (permission: fineGrained).
The token `temp` has been saved to /home/mangopig/.cache/huggingface/stored_tokens
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/mangopig/.cache/huggingface/token
Login successful.
The current active token is: `temp`
```
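You can confirm that the login worked by asking the CLI who you are (assuming the `hf auth` subcommands available in recent versions of the CLI):
```zsh frame="none"
hf auth whoami
```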
Now you can download models using the `hf download` command. I will be using the [`SmolLM3-3B`](https://huggingface.co/HuggingFaceTB/SmolLM3-3B) following this tutorial but if the model is too large for your system, you can choose a smaller model from Hugging Face, such as [`SmolLM2-1.7B`](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B) or [`SmolLM2-360M`](https://huggingface.co/HuggingFaceTB/SmolLM2-360M).
```zsh frame="none"
hf download HuggingFaceTB/SmolLM3-3B --local-dir ~/Models/SmolLM3-3B
```
<Info>
- `HuggingFaceTB/SmolLM3-3B` is the model identifier on Hugging Face. You can copy it by clicking the copy button next to the model name, as shown in the image below:
<picture>
<img src="https://pic.mangopig.tech/i/674714b4-736b-429c-b198-c9d57ba8bdee.webp" alt="Hugging Face Model Page" />
</picture>
- `--local-dir ~/Models/SmolLM3-3B` specifies where to save the downloaded model.
You can find out more about what options you can use with `hf download` by doing `hf download --help`.
```zsh frame="none"
> hf download --help
Usage: hf download [OPTIONS] REPO_ID [FILENAMES]...
Download files from the Hub.
Arguments:
REPO_ID The ID of the repo (e.g. `username/repo-name`). [required]
[FILENAMES]... Files to download (e.g. `config.json`,
`data/metadata.jsonl`).
Options:
--repo-type [model|dataset|space]
The type of repository (model, dataset, or
space). [default: model]
--revision TEXT Git revision id which can be a branch name,
a tag, or a commit hash.
--include TEXT Glob patterns to include from files to
download. eg: *.json
--exclude TEXT Glob patterns to exclude from files to
download.
--cache-dir TEXT Directory where to save files.
--local-dir TEXT If set, the downloaded file will be placed
under this directory. Check out https://hugg
ingface.co/docs/huggingface_hub/guides/downl
oad#download-files-to-local-folder for more
details.
--force-download / --no-force-download
If True, the files will be downloaded even
if they are already cached. [default: no-
force-download]
--dry-run / --no-dry-run If True, perform a dry run without actually
downloading the file. [default: no-dry-run]
--token TEXT A User Access Token generated from
https://huggingface.co/settings/tokens.
--quiet / --no-quiet If True, progress bars are disabled and only
the path to the download files is printed.
[default: no-quiet]
--max-workers INTEGER Maximum number of workers to use for
downloading files. Default is 8. [default:
8]
--help Show this message and exit.
```
</Info>
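Before moving on, you can sanity-check the download by listing the directory; the exact files depend on the model, but you should at least see the `.safetensors` weights plus config and tokenizer files:
```zsh frame="none"
ls -lh ~/Models/SmolLM3-3B
```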
With this, we have a model downloaded at `~/Models/SmolLM3-3B`. We can now proceed to try to run the model with llama.cpp.
</section>
<section data-toc="Converting Model to GGUF" data-toc-level="1">
<h2>Converting the Model to GGUF</h2>
<p>After downloading the model from Hugging Face, we need to convert it to the GGUF format so that llama.cpp can use it.</p>
<p>Hugging Face usually stores models in the `.safetensors` format.</p>
<p>However, `llama.cpp` expects models to be in the `.gguf` format.</p>
<p>So we will need to convert the model to `.gguf`. Luckily, `llama.cpp` comes with a Python script that does just that.</p>
<p>We will first create a `Python` environment with `Conda` and activate it</p>
```zsh frame="none"
conda create -n llama-cpp python=3.10 -y
conda activate llama-cpp
python -m pip install --upgrade pip wheel setuptools
```
<Info>
- `conda create -n llama-cpp python=3.10 -y` creates a new conda environment named `llama-cpp` with Python 3.10 installed
- `-n`: Specifies the name of the environment.
- `python=3.10`: Specifies the Python version to install in the environment.
- `-y`: Automatically confirms the creation.
- `conda activate llama-cpp` activates the newly created conda environment.
- `python -m pip install --upgrade pip wheel setuptools`
- We are updating `pip`, `wheel`, and `setuptools`
- `pip`: The package installer for Python. Similar to `npm` and `go get` in other languages.
- `wheel`: A built-package format for Python.
- `setuptools`: A package development and distribution library for Python.
</Info>
<p>`conda` is used to isolate the dependencies needed for the conversion process so that they don't interfere with other projects.</p>
<p>We will then install the Python dependencies for `llama.cpp`'s conversion script:</p>
```zsh frame="none"
pip install --upgrade -r ~/Projects/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt
```
<Info>
- `pip install`: Installs Python packages.
- `--upgrade`: Upgrades the packages to the latest versions.
- `-r`: Specifies that we are installing packages from a requirements file.
- `~/Projects/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt`: The path to the requirements file that contains the list of packages needed for converting models to GGUF format.
</Info>
Nice! Now we are ready to convert the model to GGUF format. We can do this by running the conversion script provided by `llama.cpp`
```zsh frame="none"
python ~/Projects/llama.cpp/convert_hf_to_gguf.py \
~/Models/SmolLM3-3B \
--outfile ~/Models/SmolLM3-3B/SmolLM3-3B.gguf
```
<Info>
- `python ~/Projects/llama.cpp/convert_hf_to_gguf.py`: uses `python` to run the conversion script located at `~/Projects/llama.cpp/convert_hf_to_gguf.py`.
- `~/Models/SmolLM3-3B`: Specifies the path to the downloaded model in Hugging Face format.
- `--outfile ~/Models/SmolLM3-3B/SmolLM3-3B.gguf`: Specifies where to save the converted model in GGUF format.
</Info>
When you see output similar to:
```zsh frame="none"
INFO:hf-to-gguf:Model successfully exported to SmolLM3-3B.gguf
```
Then you have succeeded in converting the model to GGUF format!
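You can confirm the converted file exists and see how large it is:
```zsh frame="none"
ls -lh ~/Models/SmolLM3-3B/SmolLM3-3B.gguf
```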
</section>
<section data-toc="Quantizing the Model" data-toc-level="1">
<h2>Quantizing the Model for Better Performance</h2>
<p>Quantization is a technique that reduces the size of the model by storing its weights at lower precision, which improves inference speed and lowers VRAM requirements.</p>
We can see which quantization types `llama.cpp` supports by running:
```zsh frame="none"
llama-quantize --help
```
```zsh frame="none"
usage: llama-quantize [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]
[--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]
model-f32.gguf [model-quant.gguf] type [nthreads]
--allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit
--leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing
--pure: Disable k-quant mixtures and quantize all tensors to the same type
--imatrix file_name: use data in file_name as importance matrix for quant optimizations
--include-weights tensor_name: use importance matrix for this/these tensor(s)
--exclude-weights tensor_name: use importance matrix for this/these tensor(s)
--output-tensor-type ggml_type: use this ggml_type for the output.weight tensor
--token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor
--tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0
Advanced option to selectively quantize tensors. May be specified multiple times.
--prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model
Advanced option to remove all tensors from the given layers
--keep-split: will generate quantized model in the same shards as input
--override-kv KEY=TYPE:VALUE
Advanced option to override model metadata by key in the quantized model. May be specified multiple times.
Note: --include-weights and --exclude-weights cannot be used together
Allowed quantization types:
2 or Q4_0 : 4.34G, +0.4685 ppl @ Llama-3-8B
3 or Q4_1 : 4.78G, +0.4511 ppl @ Llama-3-8B
38 or MXFP4_MOE : MXFP4 MoE
8 or Q5_0 : 5.21G, +0.1316 ppl @ Llama-3-8B
9 or Q5_1 : 5.65G, +0.1062 ppl @ Llama-3-8B
19 or IQ2_XXS : 2.06 bpw quantization
20 or IQ2_XS : 2.31 bpw quantization
28 or IQ2_S : 2.5 bpw quantization
29 or IQ2_M : 2.7 bpw quantization
24 or IQ1_S : 1.56 bpw quantization
31 or IQ1_M : 1.75 bpw quantization
36 or TQ1_0 : 1.69 bpw ternarization
37 or TQ2_0 : 2.06 bpw ternarization
10 or Q2_K : 2.96G, +3.5199 ppl @ Llama-3-8B
21 or Q2_K_S : 2.96G, +3.1836 ppl @ Llama-3-8B
23 or IQ3_XXS : 3.06 bpw quantization
26 or IQ3_S : 3.44 bpw quantization
27 or IQ3_M : 3.66 bpw quantization mix
12 or Q3_K : alias for Q3_K_M
22 or IQ3_XS : 3.3 bpw quantization
11 or Q3_K_S : 3.41G, +1.6321 ppl @ Llama-3-8B
12 or Q3_K_M : 3.74G, +0.6569 ppl @ Llama-3-8B
13 or Q3_K_L : 4.03G, +0.5562 ppl @ Llama-3-8B
25 or IQ4_NL : 4.50 bpw non-linear quantization
30 or IQ4_XS : 4.25 bpw non-linear quantization
15 or Q4_K : alias for Q4_K_M
14 or Q4_K_S : 4.37G, +0.2689 ppl @ Llama-3-8B
15 or Q4_K_M : 4.58G, +0.1754 ppl @ Llama-3-8B
17 or Q5_K : alias for Q5_K_M
16 or Q5_K_S : 5.21G, +0.1049 ppl @ Llama-3-8B
17 or Q5_K_M : 5.33G, +0.0569 ppl @ Llama-3-8B
18 or Q6_K : 6.14G, +0.0217 ppl @ Llama-3-8B
7 or Q8_0 : 7.96G, +0.0026 ppl @ Llama-3-8B
1 or F16 : 14.00G, +0.0020 ppl @ Mistral-7B
32 or BF16 : 14.00G, -0.0050 ppl @ Mistral-7B
0 or F32 : 26.00G @ 7B
COPY : only copy tensors, no quantizing
```
<Info>
For a line `2 or Q4_0 : 4.34G, +0.4685 ppl @ Llama-3-8B`
- `2` and `Q4_0` are the identifiers you can use to specify the quantization type.
- `4.34G` indicates the size of the quantized model.
- `+0.4685 ppl` indicates the increase in perplexity (a measure of model performance; lower is better) when using this quantization type
</Info>
<QA>
<span slot="question">How do I know how big of a model size can I fit in my computer</span>
<p>It depends on whether you are running inference on your <strong>CPU (System RAM)</strong> or <strong>GPU (VRAM)</strong>.</p>
<p>For CPU inference, you generally want the model size to be around 2x the size of your system RAM for comfortable operation. For example, if you have 16GB of RAM, you should aim for models that are around 8GB or smaller.</p>
**Size (GB) ≈ (Parameters (Billions) × Bits Per Weight) / 8 + Overhead**
- Bits Per Weight (bpw):
- Qx = x bits per weight
- Qx_K = K quants will keep some important weights at higher precision (Q4_K ≈ 5 bits per weight, Q5_K ≈ 6 bits per weight, Q6_K ≈ 7 bits per weight)
- Qx_K_S = Small K quants
- Qx_K_M = Medium K quants
- Qx_K_L = Large K quants
- IQx = Integer Quantization with x bits per weight, bpw is on the chart
- TQx = Ternary Quantization with x bits per weight, bpw is on the chart
</QA>
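As a rough worked example of the formula above (treating Q4_0 as roughly 4.5 effective bits per weight, since the blocks also store scales), a 3B-parameter model like SmolLM3-3B quantized to Q4_0 should land somewhere around 1.7 GB, plus runtime overhead such as the KV cache:
```zsh frame="none"
# (3 billion params * ~4.5 bits per weight) / 8 bits per byte ≈ size in GB
awk 'BEGIN { printf "~%.2f GB before overhead\n", (3 * 4.5) / 8 }'
```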
**TO BE ADDED** - Quantization Calculator
Once we have decided what quantization type to use, we can proceed to quantize the model by running:
```zsh frame="none"
llama-quantize \
~/Models/SmolLM3-3B/SmolLM3-3B.gguf \
~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf \
q4_0 \
4
```
<Info>
- `llama-quantize`: The command to run the quantization process.
- `~/Models/SmolLM3-3B/SmolLM3-3B.gguf`: The path to the original GGUF model that we want to quantize.
- `~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf`: The path where we want to save the quantized model.
- `q4_0`: The quantization type we want to use (in this case, Q4_0).
- `4`: Number of threads to use for quantization (optional, defaults to number of CPU cores).
</Info>
<p>After the quantization is complete, you should see a new file named `SmolLM3-3B.q4.gguf` in the model directory.</p>
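Comparing the original and quantized files side by side makes the savings obvious:
```zsh frame="none"
ls -lh ~/Models/SmolLM3-3B/SmolLM3-3B.gguf ~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf
```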
<p>We can now learn how to serve the model with `llama.cpp`</p>
</section>
<section data-toc="Inferencing the Model" data-toc-level="1">
<h2>Inferencing the Model</h2>
<p>Now that we have the model ready, we can proceed to run inference with it using `llama.cpp`.</p>
<p>`llama.cpp` provides us with multiple ways of running inference; we can:</p>
- Use the command line interface (CLI) to interact with the model directly from the terminal. (llama-cli)
- Use the server mode to host the model and interact with it via HTTP requests. (llama-server)
For this tutorial, we will use the `llama-server` to serve the model.
To start the server with our quantized model, we can run:
```zsh frame="none"
llama-server \
--model ~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf \
--host 0.0.0.0 \
--port 8080
```
<Info>
- `llama-server`: The command to start the server.
- `--model ~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf`: Specifies the path to the quantized model we want to serve.
- `--host 0.0.0.0`: Makes the server listen on all network interfaces, so it is reachable from outside the container through the `-p 8080:8080` port mapping.
- `--port 8080`: Specifies the port on which the server will listen for incoming requests.
You can read all the options you can customize to run the server [here](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
</Info>
As soon as you see this
```zsh frame="none"
main: model loaded
main: server is listening on http://0.0.0.0:8080
main: starting the main loop...
```
Your server is up and running! You can now interact with the model by going to [`http://localhost:8080`](http://localhost:8080) in your web browser or using tools like `curl` for API requests.
Open another terminal window and try this example API request using `curl`:
```zsh frame="none"
curl \
--request POST \
--url http://localhost:8080/completion \
--header "Content-Type: application/json" \
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```
<Info>
- `--request POST`: Specifies that we are making a POST request. (We will get into REST HTTP APIs in future tutorials)
- `--url http://localhost:8080/completion`: The URL of the server endpoint for completions.
- `--header "Content-Type: application/json"`: Sets the content type to JSON.
- `--data '{...}'`: The JSON payload containing the prompt and other parameters for the model.
Read more about the API requests [here](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#using-with-curl)
</Info>
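`llama-server` also exposes an OpenAI-compatible chat endpoint, which is handy if you later want to point existing OpenAI-style client libraries at your local model. A minimal hedged example (parameter support can vary slightly between llama.cpp versions):
```zsh frame="none"
curl \
--request POST \
--url http://localhost:8080/v1/chat/completions \
--header "Content-Type: application/json" \
--data '{"messages": [{"role": "user", "content": "Explain Docker in one sentence."}], "max_tokens": 128}'
```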
</section>

View File

@@ -36,6 +36,25 @@ h2 {
color: #ffbd72;
}
+h3 {
+font-size: 1.75em;
+color: #aad4fc;
+}
+h4 {
+font-size: 1.5em;
+color: #9dffb8;
+}
+a {
+color: #ffb8b8;
+text-decoration: underline;
+&:hover {
+color: #ff5e5e;
+}
+}
picture {
display: block;
margin: 20px auto;
@@ -53,6 +72,10 @@
line-height: 1.6;
margin-bottom: 15px;
text-align: justify;
+p {
+margin-bottom: 0;
+}
}
strong {
@@ -64,35 +87,10 @@
margin: 20px !important;
}
-.objectives {
-background-color: #fff45e1a;
-padding: 30px;
-border-radius: 10px;
-position: relative;
-min-height: 100px;
-picture {
-position: absolute;
-bottom: -10px;
-right: -10px;
-margin: 0;
-width: 140px;
-max-width: 30%;
-transform: rotate(10deg);
-img {
-width: 100%;
-height: auto;
-box-shadow: none;
-}
-}
-ul {
-list-style-type: disc;
-padding-left: 20px;
-}
-}
+code {
+background-color: color-adjust(background, 0.02, 0.02);
+color: color-adjust(primary, 0, 0);
+padding: 2px 4px;
+border-radius: 4px;
+font-family: "GeistMono", monospace;
+}

View File

@@ -1,61 +0,0 @@
---
# Path: src/content/lessons/01-intro.mdx
title: "Introduction to Web Dev"
description: "Setting up the environment"
style: "type-1"
---
import Spoiler from "../../components/Post/Spoiler.tsx";
# Hosting a Large Language Model (LLM) Locally
<picture>
<img src="https://pic.mangopig.tech/i/879aaccd-6822-423f-883a-74cf5ba598e7.jpg" alt="Web Development Illustration" />
</picture>
<blockquote class="lesson-meta">
<span>Lesson 01</span>
<span>Created at: **December 2025**</span>
<span>Last Updated: **December 2025**</span>
</blockquote>
<blockquote class="objectives" data-toc="Lesson Objectives">
## Lesson Objectives
- Setting up your Developer Environment
- Setting up a isolated Docker environment for hosting LLMs
- Introduction to basic Python environment setup
- Hosting a basic LLM model with Llama.cpp locally
<picture>
<img src="https://pic.mangopig.tech/i/4c4d1b5f-b9ce-4952-a1b4-991b19c0adb5.png" alt="MangoPig Ganbattte" />
</picture>
</blockquote>
<section data-toc="Setting Up Developer Environment">
<h2 data-toc="WSL" data-toc-level="2">Setting Up WSL (Windows Subsystem for Linux)</h2>
To set up WSL on your Windows machine, follow these steps:
1. Open PowerShell as Administrator.
2. Run the following command to enable WSL and install a Linux distribution (Ubuntu is recommended):
```zsh frame="none"
wsl --install
```
3. Restart your computer when prompted.
4. After restarting, open the Ubuntu application from the Start menu and complete the initial setup by creating a user account.
5. Update your package lists and upgrade installed packages by running:
```zsh frame="none"
sudo apt update && sudo apt upgrade -y
```
<h2 data-toc="Getting Your Environment Ready" data-toc-level="2">Getting Your Environment Ready</h2>
```wsl frame="none"
</section>