Upload and test

parent f9a669ae69
commit dba1acccee
2  00-Lesson-Site/.env  Normal file
@@ -0,0 +1,2 @@
SERVER_FRONTEND_PORT=10020
SERVER_FRONTEND_DEV_PORT=10021
26  00-Lesson-Site/docker-compose.yaml  Normal file
@@ -0,0 +1,26 @@
services:
  leafpig-lesson-site-frontend-dev:
    container_name: leafpig-lesson-site-frontend-dev
    build:
      context: ./frontend
      target: development
    ports:
      - ${SERVER_FRONTEND_DEV_PORT}:4321
    volumes:
      - ./frontend:/app
      - ./frontend/node_modules:/app/node_modules
    env_file:
      - .env
    profiles:
      - dev
  leafpig-lesson-site-frontend-prod:
    container_name: leafpig-lesson-site-frontend-prod
    build:
      context: ./frontend
      target: production
    ports:
      - ${SERVER_FRONTEND_PORT}:3000
    env_file:
      - .env
    profiles:
      - prod
43  00-Lesson-Site/frontend/Dockerfile  Normal file
@@ -0,0 +1,43 @@
# Base Image
FROM node:22-alpine AS base
WORKDIR /app

# Enable pnpm
RUN corepack enable && corepack prepare pnpm@latest --activate

# Copy manifest files first to cache dependencies
COPY pnpm-lock.yaml package.json ./

# Install dependencies
RUN pnpm install

# --- Development Stage ---
FROM base AS development
COPY . .
# Astro default port
EXPOSE 4321
# --host is required so the dev server is reachable from outside the container
CMD ["pnpm", "dev", "--host"]

# --- Build Stage ---
FROM base AS build
COPY . .
RUN pnpm build

# --- Production Stage ---
FROM base AS production
WORKDIR /app

# We install 'serve' globally here so we don't rely on node_modules
# This keeps the final image smaller/cleaner
RUN npm install -g serve

# Copy the built output from the build stage
# Astro outputs to 'dist' by default
COPY --from=build /app/dist ./dist
COPY serve.json ./dist

# Expose the port you want for production (e.g., 3000 or 80)
EXPOSE 3000

CMD ["serve", "dist", "-l", "3000", "--single", "--config", "serve.json"]
@@ -1,9 +1,8 @@
// @ts-check
import mdx from "@astrojs/mdx";
import solidJs from "@astrojs/solid-js";
import { defineConfig } from "astro/config";

import expressiveCode from "astro-expressive-code";
import { defineConfig } from "astro/config";

// https://astro.build/config
export default defineConfig({
@@ -13,6 +13,7 @@
    "@astrojs/solid-js": "^5.1.3",
    "astro": "^5.16.4",
    "astro-expressive-code": "^0.41.3",
    "serve": "^14.2.5",
    "solid-js": "^1.9.10"
  },
  "devDependencies": {
File diff suppressed because it is too large
31  00-Lesson-Site/frontend/serve.json  Normal file
@@ -0,0 +1,31 @@
{
  "headers": [
    {
      "source": "/_astro/**",
      "headers": [
        {
          "key": "Cache-Control",
          "value": "public, max-age=31536000, immutable"
        }
      ]
    },
    {
      "source": "/fonts/**",
      "headers": [
        {
          "key": "Cache-Control",
          "value": "public, max-age=31536000, immutable"
        }
      ]
    },
    {
      "source": "**",
      "headers": [
        {
          "key": "Cache-Control",
          "value": "public, max-age=0, must-revalidate"
        }
      ]
    }
  ]
}
@@ -17,8 +17,6 @@ import styles from "./DarkModeToggle.module.scss";
</button>

<script>
  // 1. Define the logic OUTSIDE the event listener.
  // This creates a stable reference that doesn't change on navigation.
  const handleToggleClick = () => {
    const root = document.documentElement;
    const isDark = root.classList.contains("dark");
@@ -39,9 +37,6 @@ import styles from "./DarkModeToggle.module.scss";
  const toggleBtn = document.getElementById("theme-toggle");

  if (toggleBtn) {
    // 2. Now this actually works!
    // Since 'handleToggleClick' is the exact same function from step 1,
    // the browser successfully removes the old one before adding the new one.
    toggleBtn.removeEventListener("click", handleToggleClick);
    toggleBtn.addEventListener("click", handleToggleClick);
  }
@@ -0,0 +1,21 @@
---
// Path: src/components/Post/Blockquotes/Ganbatte.astro

import styles from "./Ganbatte.module.scss";

interface Props {
  toc?: string;
  tocLevel?: string;
  imageAlt?: string;
}

const { toc, tocLevel = "1", imageAlt = "MangoPig Ganbatte" } = Astro.props;
---

<blockquote class={styles.ganbatte} data-toc={toc} data-toc-level={tocLevel}>
  <slot />

  <picture>
    <img src="https://pic.mangopig.tech/i/4c4d1b5f-b9ce-4952-a1b4-991b19c0adb5.png" alt={imageAlt} />
  </picture>
</blockquote>
@@ -0,0 +1,41 @@
/* Path: src/components/Post/Blockquotes/Ganbatte.module.scss */

.ganbatte {
  background-color: #fff45e1a;
  padding: 30px;
  border-radius: 10px;
  position: relative;
  min-height: 100px;

  picture {
    position: absolute;
    bottom: -10px;
    right: -10px;

    margin: 0;

    width: 200px;
    max-width: 30%;

    transform: rotate(10deg);

    img {
      width: 100%;
      height: auto;
      box-shadow: none;
    }
  }

  ul {
    list-style-type: disc;
    padding-left: 20px;
    margin-right: 220px;
  }

  span {
    position: absolute;
    top: 50%;
    left: 30px;
    transform: translateY(-50%);
  }
}
@@ -0,0 +1,12 @@
---
// Path: src/components/Post/Blockquotes/Important.astro

import styles from "./Important.module.scss";
---

<blockquote class={styles.important}>
  <slot />
  <picture class={styles.sticker}>
    <img src="https://pic.mangopig.tech/i/7eb5b343-5ddf-47ae-a272-4b82ca3d53d7.webp" alt="MangoPig Important" />
  </picture>
</blockquote>
@@ -0,0 +1,55 @@
/* Path: src/components/Post/Blockquotes/Important.module.scss */

.important {
  background-color: #ff5e5e33;
  padding: 30px;
  border-radius: 10px;
  position: relative;
  min-height: 100px;

  font-weight: 500;

  .sticker {
    position: absolute;
    bottom: 0px;
    right: -10px;

    margin: 0;

    width: 100px;
    max-width: 30%;

    transform: rotate(10deg);

    img {
      width: 100%;
      height: auto;
      box-shadow: none;
    }
  }

  ul {
    list-style-type: disc;
    padding-left: 20px;
    margin-right: 100px;
  }

  ol {
    list-style-type: decimal;
    margin-top: 20px;
    padding-left: 20px;
    margin-right: 100px;
  }

  p {
    margin-right: 100px;
  }

  span {
    // Place in middle vertically
    position: absolute;
    top: 50%;
    left: 30px;
    transform: translateY(-50%);
  }
}
@@ -0,0 +1,12 @@
---
// Path: src/components/Post/Blockquotes/Info.astro

import styles from "./Info.module.scss";
---

<blockquote class={styles.info}>
  <slot />
  <picture class={styles.sticker}>
    <img src="https://pic.mangopig.tech/i/ebf2e26a-8791-4277-90cb-079ad7454aef.webp" alt="MangoPig Ganbatte" />
  </picture>
</blockquote>
@@ -0,0 +1,53 @@
/* Path: src/components/Post/Blockquotes/Info.module.scss */

.info {
  background-color: #5efaff1a;
  padding: 30px;
  border-radius: 10px;
  position: relative;
  min-height: 100px;

  .sticker {
    position: absolute;
    bottom: -10px;
    right: -10px;

    margin: 0;

    width: 100px;
    max-width: 30%;

    transform: rotate(10deg);

    img {
      width: 100%;
      height: auto;
      box-shadow: none;
    }
  }

  ul {
    list-style-type: disc;
    padding-left: 20px;
    margin-right: 100px;
  }

  ol {
    list-style-type: decimal;
    margin-top: 20px;
    padding-left: 20px;
    margin-right: 100px;
  }

  p {
    margin-right: 100px;
  }

  span {
    // Place in middle vertically
    position: absolute;
    top: 50%;
    left: 30px;
    transform: translateY(-50%);
  }
}
@@ -0,0 +1,27 @@
---
// Path: src/components/Post/Blockquotes/QA.astro
import styles from "./QA.module.scss";
---

<div class={styles.qaContainer}>
  {/* The Question Section */}
  <div class={styles.questionHeader}>
    <span class={styles.prefix}>Q:</span>
    <span class={styles.questionText}>
      <slot name="question" />
    </span>
  </div>

  {/* The Answer Section (Default Slot) */}
  <div class={styles.answerBody}>
    <span class={styles.prefix}>A:</span>
    <div class={styles.answerContent}>
      <slot />
    </div>
  </div>

  {/* The Sticker */}
  <picture class={styles.sticker}>
    <img src="https://pic.mangopig.tech/i/4c4d1b5f-b9ce-4952-a1b4-991b19c0adb5.png" alt="Thinking MangoPig" />
  </picture>
</div>
@@ -0,0 +1,78 @@
/* Path: src/components/Post/Blockquotes/QA.module.scss */

.qaContainer {
  // Use a yellowish tint to differentiate from Info block
  background-color: #fff45e26;
  padding: 30px;
  border-radius: 10px;
  position: relative;
  min-height: 120px;
  margin-bottom: 20px;

  // --- Sticker Logic (Same as Info block) ---
  .sticker {
    position: absolute;
    bottom: -10px;
    right: -10px;
    margin: 0;
    width: 100px;
    max-width: 30%;
    // Rotate opposite way for variety
    transform: rotate(-10deg);
    pointer-events: none;

    img {
      width: 100%;
      height: auto;
      box-shadow: none;
    }
  }

  // Common prefix style (Q: and A:)
  .prefix {
    font-weight: 800;
    color: #ffbd72; // Matches your H2 color scheme
    margin-right: 12px;
    display: inline-block;
    min-width: 25px;
  }

  // --- Question Section ---
  .questionHeader {
    display: flex;
    align-items: baseline;
    margin-bottom: 20px;
    font-size: 1.1em;
    font-weight: 700;
    // Ensure text doesn't hit the sticker
    margin-right: 90px;
    color: color-adjust(primary, 0, 0);
  }

  // --- Answer Section ---
  .answerBody {
    display: flex;
    align-items: baseline;
    // Ensure text doesn't hit the sticker
    margin-right: 90px;
  }

  .answerContent {
    flex: 1;
    line-height: 1.6;

    // Handle standard markdown elements inside the answer slot
    p {
      margin-bottom: 1em;
      &:last-child {
        margin-bottom: 0;
      }
    }

    ul,
    ol {
      margin-bottom: 1em;
      padding-left: 20px;
    }
  }
}
@@ -18,24 +18,41 @@ import styles from "./FloatingTOC.module.scss";
  const targets = content.querySelectorAll("[data-toc]");

  targets.forEach((el, index) => {
    if (!el.id) el.id = `toc-item-${index}`;

    const label = el.getAttribute("data-toc");
    const label = el.getAttribute("data-toc") || "";
    const level = el.getAttribute("data-toc-level") || "1";

    // --- CHANGED SECTION START: ID Generation (Slugify) ---
    if (!el.id) {
      // Convert "Setting Up Developer Environment" -> "setting-up-developer-environment"
      const slug = label
        .toLowerCase()
        .trim()
        .replace(/[^\w\s-]/g, "") // Remove non-word chars
        .replace(/[\s_-]+/g, "-") // Replace spaces with dashes
        .replace(/^-+|-+$/g, ""); // Trim dashes from start/end

      // Safety check: If ID exists or slug is empty, fallback to index
      if (!slug || document.getElementById(slug)) {
        el.id = `section-${index}`;
      } else {
        el.id = slug;
      }
    }
    // --- CHANGED SECTION END ---

    const li = document.createElement("li");
    const a = document.createElement("a");

    // Add data attribute for CSS to target specific lengths
    li.setAttribute("data-level", level);

    a.href = `#${el.id}`;
    // No Icon span, just the text which we will hide via CSS
    a.innerHTML = `<span class="toc-text">${label}</span>`;

    a.addEventListener("click", (e) => {
      e.preventDefault();
      el.scrollIntoView({ behavior: "smooth", block: "center" });
      // Optional: Update URL hash without jumping
      history.pushState(null, "", `#${el.id}`);
    });

    li.appendChild(a);
@@ -45,14 +62,11 @@ import styles from "./FloatingTOC.module.scss";
      (entries) => {
        entries.forEach((entry) => {
          if (entry.isIntersecting) {
            // Clear all active classes
            document.querySelectorAll("#toc-list a").forEach((link) => link.classList.remove("active"));
            // Set current active
            a.classList.add("active");
          }
        });
      },
      // Tweak: Use a 1px line exactly in the vertical center of the screen
      { rootMargin: "-50% 0px -50% 0px", threshold: 0 }
    );
@@ -4,6 +4,20 @@
  border-left: 4px solid color-adjust(secondary, 0, 0);
  padding: 1rem;
  background: rgba(128, 128, 128, 0.1);

  button {
    background: none;
    border: none;
    color: color-adjust(primary, 0, 0);
    cursor: pointer;
    font-weight: bold;
    padding: 0;
    text-decoration: underline;

    &:hover {
      opacity: 0.8;
    }
  }
}

.spoilerContent {
732  00-Lesson-Site/frontend/src/content/lessons/01-intro.mdx  Normal file
@@ -0,0 +1,732 @@
---
# Path: src/content/lessons/01-intro.mdx

title: "Introduction to Web Dev"
description: "Setting up the environment"
style: "type-1"
---

{/* Blockquotes */}
import Ganbatte from "../../components/Post/Blockquotes/Ganbatte.astro";
import Important from "../../components/Post/Blockquotes/Important.astro";
import Info from "../../components/Post/Blockquotes/Info.astro";
import QA from "../../components/Post/Blockquotes/QA.astro";

import Spoiler from "../../components/Post/Spoiler.tsx";

# Hosting a Large Language Model (LLM) Locally

<picture>
  <img src="https://pic.mangopig.tech/i/879aaccd-6822-423f-883a-74cf5ba598e7.jpg" alt="Web Development Illustration" />
</picture>

<blockquote class="lesson-meta">
  <span>Lesson 01</span>
  <span>Created at: **December 2025**</span>
  <span>Last Updated: **December 2025**</span>
</blockquote>

<Ganbatte toc="Lesson Objectives" tocLevel="1" imageAlt="MangoPig Ganbatte">
## Lesson Objectives

- Setting up your Developer Environment
- Setting up an isolated Docker environment for hosting LLMs
- Fetching the AI model
- Converting the model to GGUF format
- Quantizing the model for better performance
- Hosting a basic LLM model with llama.cpp locally
- (To Be Added) Making a volume mount to persist LLM data across container restarts
- (To Be Added) Tagging the Docker Image for future reuse

</Ganbatte>

<section data-toc="Setting Up Developer Environment" data-toc-level="1">
<h2>Setting Up Your Developer Environment</h2>
<section data-toc="WSL" data-toc-level="2">
<h3>Setting Up WSL (Windows Subsystem for Linux)</h3>
To set up WSL on your Windows machine, follow these steps:
1. Open PowerShell as Administrator.
2. Run the following command to enable WSL and install a Linux distribution (Ubuntu is recommended):

   ```zsh frame="none"
   wsl --install
   ```

3. Restart your computer when prompted.
4. After restarting, open the Ubuntu application from the Start menu and complete the initial setup by creating a user account.
5. Update your package lists and upgrade installed packages by running:

   ```zsh frame="none"
   sudo apt update && sudo apt upgrade -y
   ```
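
An optional sanity check once you are inside Ubuntu is to confirm which distribution and kernel you are actually running; the exact output will vary from machine to machine:

```zsh frame="none"
# Confirm the distribution and the WSL kernel version
cat /etc/os-release
uname -r
```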
</section>

<section data-toc="ZSH" data-toc-level="2">
<h3>Getting Your Environment Ready</h3>

```zsh frame="none"
sudo apt install -y git make curl sudo zsh
```

```zsh frame="none"
mkdir -p ~/Config/Dotfiles
git clone https://git.mangopig.tech/MangoPig/Dot-Zsh.git ~/Config/Dotfiles/Zsh
cd ~/Config/Dotfiles/Zsh
```

Whenever you are prompted to install something, just confirm with `y` and hit enter.

```zsh frame="none"
make setup
```

Restart the shell to finalize the zsh setup:

```zsh frame="none"
zsh
```

With the commands above, you should have zsh, a programming-language toolchain, and Docker set up. We will go into more detail on all of the tools in this setup as we work through the lessons.
</section>

<section data-toc="Docker" data-toc-level="2">
<h3>Installing Docker</h3>
Docker should already be installed by the steps above. To verify, run:

```zsh frame="none"
docker --version
```
and try to run a test container:

```zsh frame="none"
docker run hello-world
```

If you run into permission issues, you may need to add your user to the docker group:

```zsh frame="none"
sudo usermod -aG docker $USER
```

Then restart the shell or log out and back in by doing:

```zsh frame="none"
zsh
```

</section>
</section>

<section data-toc="Docker Environment Setup" data-toc-level="1">
<h2>Setting Up the Isolated Docker Environment for Hosting LLMs</h2>
Now that we have the local environment ready, we want to set up an isolated Docker environment for hosting LLMs so that it doesn't interfere with our main system.

<section data-toc="What is Docker?" data-toc-level="2">
<h3>What is Docker?</h3>
Docker is a platform that allows you to package your application and its dependencies into containers.

<Info>
<span>You can find more Docker Images on <a href="https://hub.docker.com/">Docker Hub</a>.</span>
</Info>
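
As a tiny optional illustration of what "packaging into containers" means in practice, you can pull a small public image and run a one-off command inside it; none of this is specific to our LLM setup:

```zsh frame="none"
# Pull a small public image, list the images you have locally,
# then run a throwaway container that prints its own OS info
docker pull alpine
docker images
docker run --rm alpine cat /etc/os-release
```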

<section data-doc="Installing Docker" data-doc-level="3">
<h4>Installing Docker</h4>

</section>

</section>

<section data-toc="Creating Docker Container" data-toc-level="2">
<h3>Creating the Docker Container</h3>

For our current purpose, we will be using the official <a href="https://hub.docker.com/r/nvidia/cuda/tags">NVIDIA Docker image</a> so that we can leverage CUDA for GPU acceleration if available.

We will create the Docker container and make it interactive by running:

```zsh frame="none"
docker run --gpus all -it --name llm-container -p 8080:8080 nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04 /bin/bash
```

<Info>
- `--gpus all` enables GPU support for the container.
- `-it` makes the container interactive, allowing you to run commands inside it.
- `--name llm-container` gives the container a name for easier reference.
- `-p 8080:8080` = `-p HOST:CONTAINER` maps port 8080 on your host machine to port 8080 inside the container. This is useful if you plan to run a server inside the container and want to access it from your host machine.
- `nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04` specifies the Docker image to use.
- `/bin/bash` starts a bash shell inside the container.
</Info>
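
Assuming your host has working NVIDIA drivers and the NVIDIA Container Toolkit installed (without them, `--gpus all` will fail), you can quickly check that the GPU is visible from inside the container:

```zsh frame="none"
# Run inside the container: lists the GPUs the container can see
nvidia-smi
```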

Once you are inside the container, you can proceed to set up the environment like we did before in the <a href="#setting-up-developer-environment">WSL section</a>.

<Info>
There are a few things you need to do before you can set up the environment like we did before:
1. Update the package lists and install necessary packages:
   ```zsh frame="none"
   apt update && apt install -y git make curl sudo zsh
   ```

2. Remove the default user (usually `ubuntu`) to avoid permission issues:
   ```zsh frame="none"
   userdel -r ubuntu
   ```

3. Run my provisioning script to set up users and permissions:
   ```zsh frame="none"
   bash <(curl -s https://git.mangopig.tech/mangopig/Dot-Zsh/raw/branch/main/scripts/provision.sh)
   ```
   When prompted, create your own user with 1000 as the UID and GID for consistency, and remember the password you set here; you'll need it to use `sudo` later on.

4. Now change users by doing: **(replace `your-username` with the username you created)**
   ```zsh frame="none"
   su - your-username
   ```

   OR you can exit the container and reattach with the new user by doing:
   ```zsh frame="none"
   exit
   docker start llm-container
   docker exec -it --user your-username llm-container /bin/zsh
   ```
   Press `q` when zsh prompts you to create a configuration file.

5. Now you can proceed to set up zsh and the rest of the environment as shown in the [previous section](#zsh).

</Info>

Try to do this on your own first! If you get stuck, you can check the solution below.

<Spoiler client:idle >
## Solution

1. Update the package lists and install necessary packages:
   ```zsh frame="none"
   apt update && apt install -y git make curl sudo zsh
   ```

2. Remove the default user (usually `ubuntu`) to avoid permission issues:
   ```zsh frame="none"
   userdel -r ubuntu
   ```

3. Run my provisioning script to set up users and permissions:
   ```zsh frame="none"
   bash <(curl -s https://git.mangopig.tech/mangopig/Dot-Zsh/raw/branch/main/scripts/provision.sh)
   ```
   When prompted, create your own user with 1000 as the UID and GID for consistency, and remember the password you set here; you'll need it to use `sudo` later on.

4. Now change users by doing: **(replace `your-username` with the username you created)**
   ```zsh frame="none"
   su - your-username
   ```

   OR you can exit the container and reattach with the new user by doing:
   ```zsh frame="none"
   exit
   docker start llm-container
   docker exec -it --user your-username llm-container /bin/zsh
   ```
   Press `q` when zsh prompts you to create a configuration file.

5. Go into the dotfiles directory and set up zsh:
   ```zsh frame="none"
   cd ~/Config/Dot-Zsh
   make base && \
   make python && \
   make clean && \
   make stow
   ```

6. Restart the shell to finalize the zsh setup:
   ```zsh frame="none"
   zsh
   ```

7. Verify that Pyenv and Miniforge are working by running:
   ```zsh frame="none"
   pyenv --version
   conda --version
   ```
</Spoiler>
</section>

</section>

<section data-toc="Python Setup" data-toc-level="1">
<h2>Setting Up the Python Environment</h2>
Now that we have the Docker container set up, we can proceed to set up the environment to run llama.cpp inside the container.

We have set up `pyenv` and `Miniforge` as part of the zsh setup. You can verify that they are working by running:

```zsh frame="none"
pyenv --version
conda --version
```

`pyenv` allows us to manage multiple Python versions easily, installing different versions of Python and Conda as needed for different projects.

`conda` (via Miniforge) allows us to create isolated Python environments, which is helpful for making sure that the dependencies for llama.cpp do not interfere with other projects.
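
If you want to poke around, a couple of optional commands show what `pyenv` and `conda` currently manage on your system; the exact lists will differ from machine to machine:

```zsh frame="none"
# Python versions pyenv knows about locally
pyenv versions

# Conda environments that currently exist
conda env list
```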

Let's first create a directory for llama.cpp and navigate into it:

```zsh frame="none"
mkdir -p ~/Projects/llama.cpp
cd ~/Projects/llama.cpp
```

Now, let's clone the llama.cpp repository:

```zsh frame="none"
git clone https://github.com/ggerganov/llama.cpp.git .
```

<Info>
- You can also list the contents of the repository with `ls -la`
- The `.` at the end of the git clone command ensures that the contents of the repository are cloned directly into the current directory.
- For convenience, you can find the official llama.cpp repository at <a href="https://github.com/ggml-org/llama.cpp?tab=readme-ov-file">llama.cpp GitHub</a>
</Info>

With the repository cloned, we can now proceed to build llama.cpp.

We first use `cmake` to configure the build system. It's like telling the app what our computer environment looks like and what options we want to enable.

```zsh frame="none"
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON
```

<Info>
- `-S .` tells cmake where to find the source files (in this case, the current directory).
- `-B build` specifies where all the temporary build files will go (in a folder named `build`).
- `-G Ninja` tells cmake to use the Ninja build system.
- `-DCMAKE_BUILD_TYPE=Release` sets the build type to Release for optimized performance.
- `-DCMAKE_INSTALL_PREFIX=/your/install/dir` specifies where to install the built files. You can change this to your desired installation path.
- `-DLLAMA_BUILD_TESTS=OFF` disables building tests.
- `-DLLAMA_BUILD_EXAMPLES=ON` enables building example programs.
- `-DLLAMA_BUILD_SERVER=ON` enables building the server component.
</Info>
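
Since we are inside a CUDA-enabled image, you may also want GPU support compiled into llama.cpp. On recent llama.cpp versions this is controlled by the `-DGGML_CUDA=ON` option (older releases used a different flag name), so treat the exact flag as something to verify against the version you cloned:

```zsh frame="none"
# Same configure step as above, but with the CUDA backend enabled
# (flag name assumes a recent llama.cpp checkout; see the repo docs for yours)
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local -DGGML_CUDA=ON -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON
```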

Now we can build the project. This step basically takes what we told cmake to do and actually turns it into executable files.

```zsh frame="none"
cmake --build build --config Release -j $(nproc)
```

<Info>
- `--build build` tells cmake to build the project using the files in the `build` directory (the one we set with `-B` in the previous step).
- `--config Release` specifies that we want to build the Release version.
- `-j $(nproc)` tells cmake to use all available CPU cores for faster building.
  - `$(nproc)` is a command that returns the number of processing units available.
</Info>

After we are done building, the binaries will be located in the `build/bin` directory. We want to move them to a more accessible location (the `/usr/local` prefix we specified earlier) so we can run them easily. We can do this by running:

```zsh frame="none"
sudo cmake --install build && \
sudo ldconfig
```

<Info>
- `--install build` tells cmake to install the built files from the `build` directory to the location we specified earlier with `-DCMAKE_INSTALL_PREFIX`.
- `sudo ldconfig` updates the system's library cache to recognize the newly installed libraries.
</Info>

Now you should be able to run the `llama.cpp` binaries from anywhere. You can check which tools are available by running:

```zsh frame="none"
ls /usr/local/bin
```

```zsh frame="none"
bat                            llama-eval-callback   llama-lookup          llama-save-load-state
convert_hf_to_gguf.py          llama-export-lora     llama-lookup-create   llama-server
fd                             llama-finetune        llama-lookup-merge    llama-simple
llama-batched                  llama-gen-docs        llama-lookup-stats    llama-simple-chat
llama-batched-bench            llama-gguf            llama-mtmd-cli        llama-speculative
llama-bench                    llama-gguf-hash       llama-parallel        llama-speculative-simple
llama-cli                      llama-gguf-split      llama-passkey         llama-tokenize
llama-convert-llama2c-to-ggml  llama-idle            llama-perplexity      llama-tts
llama-cvector-generator        llama-imatrix         llama-quantize
llama-diffusion-cli            llama-logits          llama-retrieval
llama-embedding                llama-lookahead       llama-run
```

We can further verify whether we can run `llama.cpp` by checking its version:

```zsh frame="none"
llama-cli --version
```

```zsh frame="none"
version: 7327 (c8554b66e)
built with GNU 13.3.0 for Linux x86_64
```
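
Each of these tools also has its own `--help` output, which is the quickest way to see what it does; for example:

```zsh frame="none"
# Show the first few lines of the server's help text
llama-server --help | head -n 20
```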

</section>

<section data-toc="Getting the AI" data-toc-level="1">
<h2>Fetching the AI Model Weights</h2>
Now that we have llama.cpp set up, we need to get some AI models to run with it.
The main place to get models is [Hugging Face](https://huggingface.co/). You will need to create an account if you don't have one already.
Once you have created an account, you should also set up an access token from your account settings:

<picture>
  <img src="https://pic.mangopig.tech/i/aea54c8e-9dd5-44c7-ab1f-6b57b076e7d8.webp" alt="Hugging Face Access Token" />
</picture>

And then give your token all the `read` permissions.

<picture>
  <img src="https://pic.mangopig.tech/i/4360ee94-7f37-4897-91e9-882fd198b8b3.webp" alt="Hugging Face Token Permissions" />
</picture>

<Important>
Make sure to copy the token somewhere safe, and **DO NOT SHARE IT WITH ANYONE**, **USE IT IN PUBLIC REPOSITORIES**, or **HARD-CODE IT DIRECTLY IN YOUR CODE**! Consult AIs on how to keep your tokens safe if you are unsure, but do not directly share the token with the AI.
</Important>
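
One common way to keep the token out of your code is to hand it to tools through an environment variable; `huggingface_hub` and the `hf` CLI will pick up `HF_TOKEN` if it is set (the value below is a placeholder):

```zsh frame="none"
# Placeholder value - never commit or share a real token
export HF_TOKEN="hf_xxxxxxxxxxxxxxxx"
```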

Now that you have your token, you can use it to download models from Hugging Face. We will use the Hugging Face CLI (`hf`) to do this. Let's first make the directory to store the models:

```zsh frame="none"
mkdir -p ~/Models
cd ~/Models
```

We can then install the Hugging Face CLI:

```zsh frame="none"
curl -LsSf https://hf.co/cli/install.sh | bash
```

We will then log in to Hugging Face using the CLI and provide our access token when prompted:

```zsh frame="none"
git config --global credential.helper store
```

```zsh frame="none"
hf auth login
```

```zsh frame="none"
    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): INPUT_YOUR_TOKEN_HERE
Add token as git credential? [y/N]: y
Token is valid (permission: fineGrained).
The token `temp` has been saved to /home/mangopig/.cache/huggingface/stored_tokens
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/mangopig/.cache/huggingface/token
Login successful.
The current active token is: `temp`
```

Now you can download models using the `hf download` command. I will be using [`SmolLM3-3B`](https://huggingface.co/HuggingFaceTB/SmolLM3-3B) for this tutorial, but if the model is too large for your system, you can choose a smaller model from Hugging Face, such as [`SmolLM2-1.7B`](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B) or [`SmolLM2-360M`](https://huggingface.co/HuggingFaceTB/SmolLM2-360M).

```zsh frame="none"
hf download HuggingFaceTB/SmolLM3-3B --local-dir ~/Models/SmolLM3-3B
```

<Info>
- `HuggingFaceTB/SmolLM3-3B` is the model identifier on Hugging Face. Get it by clicking the copy button shown in the image below:
  <picture>
    <img src="https://pic.mangopig.tech/i/674714b4-736b-429c-b198-c9d57ba8bdee.webp" alt="Hugging Face Model Page" />
  </picture>
- `--local-dir ~/Models/SmolLM3-3B` specifies where to save the downloaded model.

You can find out more about the options you can use with `hf download` by running `hf download --help`.

```zsh frame="none"
> hf download --help

Usage: hf download [OPTIONS] REPO_ID [FILENAMES]...

Download files from the Hub.

Arguments:
  REPO_ID         The ID of the repo (e.g. `username/repo-name`). [required]
  [FILENAMES]...  Files to download (e.g. `config.json`, `data/metadata.jsonl`).

Options:
  --repo-type [model|dataset|space]  The type of repository (model, dataset, or space). [default: model]
  --revision TEXT                    Git revision id which can be a branch name, a tag, or a commit hash.
  --include TEXT                     Glob patterns to include from files to download. eg: *.json
  --exclude TEXT                     Glob patterns to exclude from files to download.
  --cache-dir TEXT                   Directory where to save files.
  --local-dir TEXT                   If set, the downloaded file will be placed under this directory. Check out
                                     https://huggingface.co/docs/huggingface_hub/guides/download#download-files-to-local-folder
                                     for more details.
  --force-download / --no-force-download
                                     If True, the files will be downloaded even if they are already cached.
                                     [default: no-force-download]
  --dry-run / --no-dry-run           If True, perform a dry run without actually downloading the file. [default: no-dry-run]
  --token TEXT                       A User Access Token generated from https://huggingface.co/settings/tokens.
  --quiet / --no-quiet               If True, progress bars are disabled and only the path to the download files is printed.
                                     [default: no-quiet]
  --max-workers INTEGER              Maximum number of workers to use for downloading files. Default is 8. [default: 8]
  --help                             Show this message and exit.
```
</Info>

With this, we have a model downloaded at `~/Models/SmolLM3-3B`. We can now proceed to try to run the model with llama.cpp.
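
It is worth glancing at what actually landed on disk; a Hugging Face model directory typically contains the `.safetensors` weight shards plus tokenizer and config files:

```zsh frame="none"
# Inspect the downloaded files and their sizes
ls -lh ~/Models/SmolLM3-3B
```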

</section>

<section data-toc="Converting Model to GGUF" data-toc-level="1">
<h2>Converting the Model to GGUF</h2>
<p>After downloading the model from Hugging Face, we need to convert it to the GGUF format so that llama.cpp can use it.</p>
<p>Hugging Face usually stores models in the `.safetensors` format.</p>
<p>However, `llama.cpp` expects models to be in the `.gguf` format.</p>
<p>So we will need to convert the model to `.gguf`. Luckily, `llama.cpp` comes with a Python script that helps us do just that.</p>
<p>We will first create a `Python` environment with `Conda` and activate it:</p>

```zsh frame="none"
conda create -n llama-cpp python=3.10 -y
conda activate llama-cpp
python -m pip install --upgrade pip wheel setuptools
```

<Info>
- `conda create -n llama-cpp python=3.10 -y` creates a new conda environment named `llama-cpp` with Python 3.10 installed
  - `-n`: Specifies the name of the environment.
  - `python=3.10`: Specifies the Python version to install in the environment.
  - `-y`: Automatically confirms the creation.
- `conda activate llama-cpp` activates the newly created conda environment.
- `python -m pip install --upgrade pip wheel setuptools`
  - We are updating `pip`, `wheel`, and `setuptools`
  - `pip`: The package installer for Python. Similar to `npm` and `go get` in other languages.
  - `wheel`: A built-package format for Python.
  - `setuptools`: A package development and distribution library for Python.
</Info>

<p>`conda` is used to isolate the dependencies needed for the conversion process so that they don't interfere with other projects.</p>
<p>We will then install the conversion dependencies for `llama.cpp`:</p>

```zsh frame="none"
pip install --upgrade -r ~/Projects/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt
```

<Info>
- `pip install`: Installs Python packages.
  - `--upgrade`: Upgrades the packages to the latest versions.
  - `-r`: Specifies that we are installing packages from a requirements file.
- `~/Projects/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt`: The path to the requirements file that contains the list of packages needed for converting models to GGUF format.
</Info>

Nice! Now we are ready to convert the model to GGUF format. We can do this by running the conversion script provided by `llama.cpp`:

```zsh frame="none"
python ~/Projects/llama.cpp/convert_hf_to_gguf.py \
  ~/Models/SmolLM3-3B \
  --outfile ~/Models/SmolLM3-3B/SmolLM3-3B.gguf
```

<Info>
- `python ~/Projects/llama.cpp/convert_hf_to_gguf.py`: `python` runs the conversion script located at `~/Projects/llama.cpp/convert_hf_to_gguf.py`.
- `~/Models/SmolLM3-3B`: Specifies the path to the downloaded model in Hugging Face format.
- `--outfile ~/Models/SmolLM3-3B/SmolLM3-3B.gguf`: Specifies where to save the converted model in GGUF format.
</Info>

When you see output similar to:

```zsh frame="none"
INFO:hf-to-gguf:Model successfully exported to SmolLM3-3B.gguf
```

then you have succeeded in converting the model to GGUF format!
</section>

<section data-toc="Quantizing the Model" data-toc-level="1">
<h2>Quantizing the Model for Better Performance</h2>
<p>Quantization is a technique that reduces the size of the model by storing its weights at lower precision, which improves inference speed and lowers the RAM/VRAM required.</p>

We can see which quantization types `llama.cpp` supports by running:

```zsh frame="none"
llama-quantize --help
```

```zsh frame="none"
usage: llama-quantize [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]
       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]
       model-f32.gguf [model-quant.gguf] type [nthreads]

  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit
  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing
  --pure: Disable k-quant mixtures and quantize all tensors to the same type
  --imatrix file_name: use data in file_name as importance matrix for quant optimizations
  --include-weights tensor_name: use importance matrix for this/these tensor(s)
  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)
  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor
  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor
  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0
      Advanced option to selectively quantize tensors. May be specified multiple times.
  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model
      Advanced option to remove all tensors from the given layers
  --keep-split: will generate quantized model in the same shards as input
  --override-kv KEY=TYPE:VALUE
      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.
Note: --include-weights and --exclude-weights cannot be used together

Allowed quantization types:
   2  or  Q4_0      :  4.34G, +0.4685 ppl @ Llama-3-8B
   3  or  Q4_1      :  4.78G, +0.4511 ppl @ Llama-3-8B
  38  or  MXFP4_MOE : MXFP4 MoE
   8  or  Q5_0      :  5.21G, +0.1316 ppl @ Llama-3-8B
   9  or  Q5_1      :  5.65G, +0.1062 ppl @ Llama-3-8B
  19  or  IQ2_XXS   :  2.06 bpw quantization
  20  or  IQ2_XS    :  2.31 bpw quantization
  28  or  IQ2_S     :  2.5  bpw quantization
  29  or  IQ2_M     :  2.7  bpw quantization
  24  or  IQ1_S     :  1.56 bpw quantization
  31  or  IQ1_M     :  1.75 bpw quantization
  36  or  TQ1_0     :  1.69 bpw ternarization
  37  or  TQ2_0     :  2.06 bpw ternarization
  10  or  Q2_K      :  2.96G, +3.5199 ppl @ Llama-3-8B
  21  or  Q2_K_S    :  2.96G, +3.1836 ppl @ Llama-3-8B
  23  or  IQ3_XXS   :  3.06 bpw quantization
  26  or  IQ3_S     :  3.44 bpw quantization
  27  or  IQ3_M     :  3.66 bpw quantization mix
  12  or  Q3_K      : alias for Q3_K_M
  22  or  IQ3_XS    :  3.3 bpw quantization
  11  or  Q3_K_S    :  3.41G, +1.6321 ppl @ Llama-3-8B
  12  or  Q3_K_M    :  3.74G, +0.6569 ppl @ Llama-3-8B
  13  or  Q3_K_L    :  4.03G, +0.5562 ppl @ Llama-3-8B
  25  or  IQ4_NL    :  4.50 bpw non-linear quantization
  30  or  IQ4_XS    :  4.25 bpw non-linear quantization
  15  or  Q4_K      : alias for Q4_K_M
  14  or  Q4_K_S    :  4.37G, +0.2689 ppl @ Llama-3-8B
  15  or  Q4_K_M    :  4.58G, +0.1754 ppl @ Llama-3-8B
  17  or  Q5_K      : alias for Q5_K_M
  16  or  Q5_K_S    :  5.21G, +0.1049 ppl @ Llama-3-8B
  17  or  Q5_K_M    :  5.33G, +0.0569 ppl @ Llama-3-8B
  18  or  Q6_K      :  6.14G, +0.0217 ppl @ Llama-3-8B
   7  or  Q8_0      :  7.96G, +0.0026 ppl @ Llama-3-8B
   1  or  F16       : 14.00G, +0.0020 ppl @ Mistral-7B
  32  or  BF16      : 14.00G, -0.0050 ppl @ Mistral-7B
   0  or  F32       : 26.00G              @ 7B
          COPY      : only copy tensors, no quantizing
```

<Info>
For a line like `2 or Q4_0 : 4.34G, +0.4685 ppl @ Llama-3-8B`:
- `2` and `Q4_0` are the identifiers you can use to specify the quantization type.
- `4.34G` indicates the size of the quantized model.
- `+0.4685 ppl` indicates the increase in perplexity (a measure of model performance; lower is better) when using this quantization type.
</Info>

<QA>
<span slot="question">How do I know how big a model I can fit on my computer?</span>
<p>It depends on whether you are running inference on your <strong>CPU (System RAM)</strong> or <strong>GPU (VRAM)</strong>.</p>

<p>For CPU inference, you generally want your system RAM to be around 2x the model size for comfortable operation. For example, if you have 16GB of RAM, you should aim for models that are around 8GB or smaller.</p>

**Size (GB) ≈ (Parameters (Billions) × Bits Per Weight) / 8 + Overhead**

- Bits Per Weight (bpw):
  - Qx = x bits per weight
  - Qx_K = K quants will keep some important weights at higher precision (Q4_K ≈ 5 bits per weight, Q5_K ≈ 6 bits per weight, Q6_K ≈ 7 bits per weight)
    - Qx_K_S = Small K quants
    - Qx_K_M = Medium K quants
    - Qx_K_L = Large K quants
  - IQx = Integer Quantization with x bits per weight; the bpw is listed in the chart
  - TQx = Ternary Quantization with x bits per weight; the bpw is listed in the chart
</QA>
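
As a rough worked example using the formula above: SmolLM3-3B has about 3 billion parameters, so at Q4_K_M (≈5 bits per weight) the file comes out to roughly (3 × 5) / 8 ≈ 1.9 GB plus overhead, while Q8_0 (≈8 bits per weight) is closer to (3 × 8) / 8 ≈ 3 GB. Treat these as ballpark estimates, not exact file sizes.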

**TO BE ADDED** - Quantization Calculator

Once we have decided what quantization type to use, we can proceed to quantize the model by running:

```zsh frame="none"
llama-quantize \
  ~/Models/SmolLM3-3B/SmolLM3-3B.gguf \
  ~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf \
  q4_0 \
  4
```

<Info>
- `llama-quantize`: The command to run the quantization process.
- `~/Models/SmolLM3-3B/SmolLM3-3B.gguf`: The path to the original GGUF model that we want to quantize.
- `~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf`: The path where we want to save the quantized model.
- `q4_0`: The quantization type we want to use (in this case, Q4_0).
- `4`: Number of threads to use for quantization (optional, defaults to the number of CPU cores).
</Info>
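
A quick way to see the effect of quantization is to compare the converted and quantized files side by side; the `q4` file should be noticeably smaller than the original conversion:

```zsh frame="none"
# Compare the full-precision conversion against the quantized file
ls -lh ~/Models/SmolLM3-3B/*.gguf
```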

<p>After the quantization is complete, you should see a new file named `SmolLM3-3B.q4.gguf` in the model directory.</p>
<p>We can now learn how to serve the model with `llama.cpp`.</p>

</section>

<section data-toc="Inferencing the Model" data-toc-level="1">
<h2>Inferencing the Model</h2>
<p>Now that we have the model ready, we can proceed to run inference with it using `llama.cpp`.</p>
<p>`llama.cpp` provides multiple ways of running inference; we can:</p>
- Use the command line interface (CLI) to interact with the model directly from the terminal. (llama-cli)
- Use the server mode to host the model and interact with it via HTTP requests. (llama-server)

For this tutorial, we will use `llama-server` to serve the model.

To start the server with our quantized model, we can run:

```zsh frame="none"
llama-server \
  --model ~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf \
  --host 0.0.0.0 \
  --port 8080
```

<Info>
- `llama-server`: The command to start the server.
- `--model ~/Models/SmolLM3-3B/SmolLM3-3B.q4.gguf`: Specifies the path to the quantized model we want to serve.
- `--host 0.0.0.0`: Makes the server accessible from any IP address.
- `--port 8080`: Specifies the port on which the server will listen for incoming requests.

You can read about all the options you can use to customize the server [here](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
</Info>

As soon as you see this output:

```zsh frame="none"
main: model loaded
main: server is listening on http://0.0.0.0:8080
main: starting the main loop...
```

your server is up and running! You can now interact with the model by going to [`http://localhost:8080`](http://localhost:8080) in your web browser or using tools like `curl` for API requests.

Open another terminal window and try this example API request using `curl`:

```zsh frame="none"
curl \
  --request POST \
  --url http://localhost:8080/completion \
  --header "Content-Type: application/json" \
  --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```

<Info>
- `--request POST`: Specifies that we are making a POST request. (We will get into REST HTTP APIs in future tutorials)
- `--url http://localhost:8080/completion`: The URL of the server endpoint for completions.
- `--header "Content-Type: application/json"`: Sets the content type to JSON.
- `--data '{...}'`: The JSON payload containing the prompt and other parameters for the model.

Read more about the API requests [here](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#using-with-curl)
</Info>
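
`llama-server` also exposes an OpenAI-style chat completions endpoint, which is handy if you want to point existing OpenAI-compatible clients at your local model; the exact routes and fields can vary between llama.cpp versions, so check the server README linked above if this request fails:

```zsh frame="none"
curl \
  --request POST \
  --url http://localhost:8080/v1/chat/completions \
  --header "Content-Type: application/json" \
  --data '{"messages": [{"role": "user", "content": "Say hello in one short sentence."}], "max_tokens": 64}'
```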

</section>
@@ -36,6 +36,25 @@ h2 {
  color: #ffbd72;
}

h3 {
  font-size: 1.75em;
  color: #aad4fc;
}

h4 {
  font-size: 1.5em;
  color: #9dffb8;
}

a {
  color: #ffb8b8;
  text-decoration: underline;

  &:hover {
    color: #ff5e5e;
  }
}

picture {
  display: block;
  margin: 20px auto;
@@ -53,6 +72,10 @@ blockquote {
  line-height: 1.6;
  margin-bottom: 15px;
  text-align: justify;

  p {
    margin-bottom: 0;
  }
}

strong {
@@ -64,35 +87,10 @@ strong {
  margin: 20px !important;
}

.objectives {
  background-color: #fff45e1a;
  padding: 30px;
  border-radius: 10px;

  position: relative;
  min-height: 100px;

  picture {
    position: absolute;
    bottom: -10px;
    right: -10px;

    margin: 0;

    width: 140px;
    max-width: 30%;

    transform: rotate(10deg);

    img {
      width: 100%;
      height: auto;
      box-shadow: none;
    }
  }

  ul {
    list-style-type: disc;
    padding-left: 20px;
  }
code {
  background-color: color-adjust(background, 0.02, 0.02);
  color: color-adjust(primary, 0, 0);
  padding: 2px 4px;
  border-radius: 4px;
  font-family: "GeistMono", monospace;
}
@ -1,61 +0,0 @@
|
||||
---
|
||||
# Path: src/content/lessons/01-intro.mdx
|
||||
|
||||
title: "Introduction to Web Dev"
|
||||
description: "Setting up the environment"
|
||||
style: "type-1"
|
||||
---
|
||||
|
||||
import Spoiler from "../../components/Post/Spoiler.tsx";
|
||||
|
||||
# Hosting a Large Language Model (LLM) Locally
|
||||
|
||||
<picture>
|
||||
<img src="https://pic.mangopig.tech/i/879aaccd-6822-423f-883a-74cf5ba598e7.jpg" alt="Web Development Illustration" />
|
||||
</picture>
|
||||
|
||||
<blockquote class="lesson-meta">
|
||||
<span>Lesson 01</span>
|
||||
<span>Created at: **December 2025**</span>
|
||||
<span>Last Updated: **December 2025**</span>
|
||||
</blockquote>
|
||||
|
||||
<blockquote class="objectives" data-toc="Lesson Objectives">
|
||||
|
||||
## Lesson Objectives
|
||||
|
||||
- Setting up your Developer Environment
|
||||
- Setting up a isolated Docker environment for hosting LLMs
|
||||
- Introduction to basic Python environment setup
|
||||
- Hosting a basic LLM model with Llama.cpp locally
|
||||
|
||||
<picture>
|
||||
<img src="https://pic.mangopig.tech/i/4c4d1b5f-b9ce-4952-a1b4-991b19c0adb5.png" alt="MangoPig Ganbattte" />
|
||||
</picture>
|
||||
|
||||
</blockquote>
|
||||
|
||||
<section data-toc="Setting Up Developer Environment">
|
||||
<h2 data-toc="WSL" data-toc-level="2">Setting Up WSL (Windows Subsystem for Linux)</h2>
|
||||
To set up WSL on your Windows machine, follow these steps:
|
||||
1. Open PowerShell as Administrator.
|
||||
2. Run the following command to enable WSL and install a Linux distribution (Ubuntu is recommended):
|
||||
|
||||
```zsh frame="none"
|
||||
wsl --install
|
||||
```
|
||||
|
||||
3. Restart your computer when prompted.
|
||||
4. After restarting, open the Ubuntu application from the Start menu and complete the initial setup by creating a user account.
|
||||
5. Update your package lists and upgrade installed packages by running:
|
||||
|
||||
```zsh frame="none"
|
||||
sudo apt update && sudo apt upgrade -y
|
||||
```
|
||||
|
||||
<h2 data-toc="Getting Your Environment Ready" data-toc-level="2">Getting Your Environment Ready</h2>
|
||||
|
||||
```wsl frame="none"
|
||||
|
||||
|
||||
</section>
|
||||