HuggingFaceからGemma3nをダウンロードしてllama.cppで動かす（Google Colab📒ノートブック付）

HuggingFaceからGemma3nの修正済みGGUFファイルをダウンロードして、llama.cppで実行する効率的な方法です。

GraphGen JP ✖ gemma3n でファインチューニングしてみる7⃣
無事に、Unsloth版のGemma3nをダウンロードしてllama.cppで動かせた！！！ https://t.co/FuQkaavPya pic.twitter.com/1Pff68BW3q

— Maki@Sunwood AI Labs. (@hAru_mAki_ch) July 12, 2025

⚠️ 重要：なぜUnsloth版を使う必要があるのか

通常のGoogle公式Gemma-3n-E4Bは、llama.cppで正常に動作しません！ Unslothチームが複数の重要なバグを発見・修正し、llama.cppで動作するように最適化したGGUFファイルを提供しています。

Unslothの修正内容:

チャットテンプレートとトークナイザーの修正
RoPEスケーリングの修正
未初期化ウェイトの問題の特定と修正
BOSトークンの適切な処理
最適な推論パラメータの設定

UnslothはHugging FaceのXuan-Son Nguyenとllama.cppチームのGeorgi Gerganovと協力してGemma 3nをllama.cppで動作させました。

環境準備

# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

llama.cppのセットアップ（改良版）

# 既存のビルドがあるかチェック
import os
import shutil

def setup_llama_cpp():
    """Restore a cached llama.cpp build from Google Drive, if one exists.

    When the cached archive is present, llama.cpp is cloned into
    /content/llama.cpp (if missing), the working directory is changed to
    that checkout, and the archive is unpacked on top of it.

    Returns:
        bool: True when the cached build was restored successfully; False
        when there is no cache or restoring it failed, i.e. a fresh build
        is required.
    """
    drive_build_path = "/content/drive/MyDrive/llama_cpp_build.tar.gz"
    repo_dir = "/content/llama.cpp"

    if not os.path.exists(drive_build_path):
        print("🔨 新規ビルドを開始します...")
        return False

    print("🔄 既存のビルドファイルを発見しました。復元中...")

    # The archive only holds build outputs, so a source checkout is still needed.
    if not os.path.exists(repo_dir):
        clone_status = os.system(
            f'git clone https://github.com/ggerganov/llama.cpp.git {repo_dir}'
        )
        if clone_status != 0:
            print("❌ llama.cppのクローンに失敗しました。")
            return False

    # Unpack relative to the checkout; the archive stores paths like build/bin/.
    os.chdir(repo_dir)
    if os.system(f'tar -xzf {drive_build_path}') != 0:
        # A corrupt or truncated cache must not be reported as a usable build.
        print("❌ ビルドファイルの復元に失敗しました。新規ビルドを開始します...")
        return False

    print("✅ ビルドファイルの復元が完了しました。")
    return True

# Run the setup: restore the cached build, or build from scratch.
# setup_llama_cpp() is called exactly once; calling it again after a fresh
# build would only re-extract the archive that was just created.
if not setup_llama_cpp():
    # No usable cache: clone (if needed) and build llama.cpp with CUDA.
    print("新規ビルドを実行中...")

    if not os.path.exists('/content/llama.cpp'):
        if os.system('git clone https://github.com/ggerganov/llama.cpp.git /content/llama.cpp') != 0:
            raise RuntimeError("llama.cppのクローンに失敗しました。")

    os.chdir('/content/llama.cpp')

    # Configure and compile with CUDA support. Stop at the first failing
    # step so that a broken build is never archived and cached on Drive.
    build_steps = [
        'cmake -B build -DGGML_CUDA=ON',
        'cmake --build build --config Release -j$(nproc)',
    ]
    for step in build_steps:
        if os.system(step) != 0:
            raise RuntimeError(f"ビルドに失敗しました: {step}")

    # Archive the binaries and libraries, then copy the archive to Drive.
    print("💾 ビルドファイルを保存中...")
    archived = os.system(
        'tar -czf llama_cpp_build.tar.gz build/bin/ '
        '$(find build -name "*.so" -o -name "*.a" 2>/dev/null)'
    ) == 0
    if archived and os.system('cp llama_cpp_build.tar.gz /content/drive/MyDrive/') == 0:
        print("✅ ビルドが完了し、Driveに保存されました。")
    else:
        # The build itself succeeded; only the cache for later sessions is missing.
        print("⚠️ ビルドは完了しましたが、Driveへの保存に失敗しました。")
else:
    os.chdir('/content/llama.cpp')

Gemma3nモデルのダウンロード（重要：修正済み版）

# HuggingFace経由でUnslothの修正済みGGUFをダウンロード
# ⚠️ 重要：Google公式版ではなく、必ずUnsloth修正版を使用してください！
import os

def download_gemma3n_model(
    model_e4b_path="/content/gemma3n-e4b-fixed.gguf",
    model_e2b_path="/content/gemma3n-e2b-fixed.gguf",
):
    """Return the path of a local Gemma 3n GGUF file, downloading E4B if needed.

    An existing non-empty E4B file is preferred, then an existing non-empty
    E2B file. Otherwise the Unsloth E4B GGUF is fetched with wget.

    Args:
        model_e4b_path: Where the E4B model is (or will be) stored.
        model_e2b_path: Where a previously downloaded E2B model may be stored.

    Returns:
        str | None: Path of the usable model file, or None when the download
        failed.
    """
    import subprocess  # local import: only `os` is imported at the top of this cell

    def _is_usable(path):
        # A zero-byte file is the leftover of a failed download, not a model.
        return os.path.isfile(path) and os.path.getsize(path) > 0

    # Reuse an earlier download when one is available.
    if _is_usable(model_e4b_path):
        print(f"✅ E4B版修正済みモデルが既に存在します: {model_e4b_path}")
        return model_e4b_path
    if _is_usable(model_e2b_path):
        print(f"✅ E2B版修正済みモデルが既に存在します: {model_e2b_path}")
        return model_e2b_path

    print("📥 Gemma3n修正済みモデルをダウンロード中...")
    print("🔧 UnslothによるバグFIX版をダウンロード中...")

    # Default to the E4B (higher quality) build published by Unsloth.
    download_url = "https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF/resolve/main/gemma-3n-E4B-it-UD-Q4_K_XL.gguf"
    output_path = model_e4b_path
    # `wget -O` creates its output file even when the transfer fails, so
    # download to a temporary name and only move it into place on success.
    partial_path = output_path + ".part"

    print("🔄 ダウンロード中: Gemma3n E4B版（Unsloth修正済み）(約7.5GB)")
    print("   これで通常のgemma-3n-E4Bはllama.cppで動かないが、これなら動きます！")
    try:
        succeeded = subprocess.run(
            ['wget', '-O', partial_path, download_url]
        ).returncode == 0
    except OSError:
        # wget is not installed or could not be executed.
        succeeded = False

    if succeeded and _is_usable(partial_path):
        os.replace(partial_path, output_path)
        print(f"✅ 修正済みモデルのダウンロード完了: {output_path}")
        print("🎉 このモデルはllama.cppで正常に動作します！")
        return output_path

    # Remove the partial file so the next run does not mistake it for a model.
    if os.path.exists(partial_path):
        os.remove(partial_path)
    print("❌ ダウンロードに失敗しました。")
    return None

# Download the model; the returned path is kept in `selected_model`
selected_model = download_gemma3n_model()

# Optional lightweight variant (also the Unsloth-fixed build)
def download_gemma3n_light(model_path="/content/gemma3n-e2b-fixed.gguf"):
    """Return the path of a local Gemma 3n E2B GGUF file, downloading it if needed.

    Args:
        model_path: Where the E2B model is (or will be) stored.

    Returns:
        str | None: Path of the usable model file, or None when the download
        failed.
    """
    import subprocess  # local import: only `os` is imported at the top of this cell

    def _is_usable(path):
        # A zero-byte file is the leftover of a failed download, not a model.
        return os.path.isfile(path) and os.path.getsize(path) > 0

    if _is_usable(model_path):
        print(f"✅ 軽量版修正済みモデルが既に存在します: {model_path}")
        return model_path

    print("📥 Gemma3n軽量版（修正済み）をダウンロード中...")
    print("🔧 こちらもUnslothによるllama.cpp対応版です！")

    # Unsloth-fixed E2B build.
    download_url = "https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF/resolve/main/gemma-3n-E2B-it-UD-Q4_K_XL.gguf"
    # `wget -O` creates its output file even when the transfer fails, so
    # download to a temporary name and only move it into place on success.
    partial_path = model_path + ".part"

    print("🔄 ダウンロード中: Gemma3n E2B版（Unsloth修正済み）(約5.6GB)")
    try:
        succeeded = subprocess.run(
            ['wget', '-O', partial_path, download_url]
        ).returncode == 0
    except OSError:
        # wget is not installed or could not be executed.
        succeeded = False

    if succeeded and _is_usable(partial_path):
        os.replace(partial_path, model_path)
        print(f"✅ 軽量版修正済みモデルのダウンロード完了: {model_path}")
        print("🎉 このモデルもllama.cppで正常に動作します！")
        return model_path

    # Remove the partial file so the next run does not mistake it for a model.
    if os.path.exists(partial_path):
        os.remove(partial_path)
    print("❌ ダウンロードに失敗しました。")
    return None

# 軽量版が必要な場合は以下を実行
# selected_model = download_gemma3n_light()

llama.cppで実行

# Smoke test: run llama-cli once on the downloaded E4B model with a short prompt (-n 20)
!/content/llama.cpp/build/bin/llama-cli -m /content/gemma3n-e4b-fixed.gguf -p "Hello!" -n 20  -no-cnv

高度な実行（パラメータ調整版）

# Use the sampling parameters recommended by Unsloth
!/content/llama.cpp/build/bin/llama-cli \
    -m /content/gemma3n-e4b-fixed.gguf \
    -p "Write a Python function to calculate fibonacci numbers:" \
    -n 256 \
    -ngl 99 \
    --temp 1.0 \
    --top-k 64 \
    --top-p 0.95 \
    --min-p 0.0 \
    --repeat-penalty 1.0 \
    -no-cnv

インタラクティブモード

# Interactive chat mode
def start_interactive_chat():
    """Run a prompt loop that sends each line the user types to llama-cli.

    Lines are read with input() until the user enters 'exit', 'quit' or
    '終了', or until stdin is closed or interrupted. Every prompt is handed
    to a fresh llama-cli process that loads the module-level
    ``selected_model`` file with the Unsloth-recommended sampling settings.
    """
    import subprocess  # local import: not imported at the top of this cell

    if not selected_model or not os.path.exists(selected_model):
        print("❌ モデルファイルが見つかりません。")
        return

    print("🤖 インタラクティブモードを開始します。'exit'で終了。")

    while True:
        try:
            user_input = input("\nあなた: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the chat instead of raising.
            print("👋 チャットを終了します。")
            break

        if user_input.strip().lower() in ['exit', 'quit', '終了']:
            print("👋 チャットを終了します。")
            break
        if not user_input.strip():
            # Nothing to send; ask again.
            continue

        # Pass the prompt as a single argv entry (no shell), so quotes, `$`
        # or backticks in the user's text cannot break or inject into the
        # command line. The absolute binary path matches the other cells and
        # does not depend on the current working directory.
        # NOTE(review): unlike the one-shot cells in this file, -no-cnv is
        # not passed here — confirm llama-cli exits after each reply.
        command = [
            '/content/llama.cpp/build/bin/llama-cli',
            '-m', selected_model,
            '--ctx-size', '32768',
            '--n-gpu-layers', '99',
            '--temp', '1.0',
            '--top-k', '64',
            '--top-p', '0.95',
            '--min-p', '0.0',
            '--repeat-penalty', '1.0',
            '-p', user_input,
            '-n', '256',
            '--color',
        ]

        try:
            subprocess.run(command)
        except OSError as exc:
            # llama-cli is missing or not executable; retrying cannot help.
            print(f"❌ llama-cliを実行できません: {exc}")
            break

# インタラクティブチャットを開始
# start_interactive_chat()

サーバーモード

import subprocess
import time
import requests

# Start the server (using the Unsloth-recommended parameters)
def start_server(startup_timeout=120):
    """Launch llama-server in the background and wait until it answers.

    Instead of sleeping for a fixed time, the server's /health endpoint is
    polled once per second until it returns HTTP 200, the process exits, or
    ``startup_timeout`` seconds have passed.

    Args:
        startup_timeout: Maximum number of seconds to wait for readiness.

    Returns:
        subprocess.Popen: Handle of the server process. It is returned in
        every case; check ``poll()`` to see whether the server is alive.
    """
    import urllib.error
    import urllib.request

    process = subprocess.Popen([
        '/content/llama.cpp/build/bin/llama-server',
        '-m', '/content/gemma3n-e4b-fixed.gguf',
        '--host', '0.0.0.0',
        '--port', '8081',
        '--n-gpu-layers', '99',
        '--ctx-size', '32768',  # Gemma 3n supports a 32K context
        '--temp', '1.0',        # Unsloth-recommended settings
        '--top-k', '64',
        '--top-p', '0.95',
        '--min-p', '0.0',
        '--repeat-penalty', '1.0'
    ])

    print("🚀 サーバー起動中...")

    deadline = time.monotonic() + startup_timeout
    while time.monotonic() < deadline:
        # A non-None poll() means the server already exited (bad path, OOM, ...).
        if process.poll() is not None:
            print(f"❌ サーバーが終了しました (exit code: {process.returncode})")
            return process
        try:
            with urllib.request.urlopen("http://localhost:8081/health", timeout=2) as reply:
                if reply.status == 200:
                    print("✅ サーバー起動完了")
                    return process
        except (urllib.error.URLError, OSError):
            # Not listening yet, or still loading the model (non-200 reply).
            pass
        time.sleep(1)

    print(f"⚠️ サーバーの起動を確認できませんでした ({startup_timeout}秒でタイムアウト)")
    return process

# Start the server and keep the process handle in `server_process`
server_process = start_server()

# Send a completion request to the server
def query_server(prompt):
    """POST *prompt* to the local llama-server /completion endpoint.

    Returns the "content" field of the JSON reply on HTTP 200, an
    "エラー: <status>" string for any other status code, and a
    "接続エラー: <exception>" string if anything raises along the way.
    """
    # Request body: the prompt plus the sampling settings used by the other cells.
    payload = {
        "prompt": prompt,
        "n_predict": 128,
        "temperature": 1.0,
        "top_k": 64,
        "top_p": 0.95,
        "min_p": 0.0,
        "repeat_penalty": 1.0,
    }

    try:
        reply = requests.post(
            "http://localhost:8081/completion",
            json=payload,
            timeout=30,
        )
        # Guard clause: anything other than 200 is reported by status code.
        if reply.status_code != 200:
            return f"エラー: {reply.status_code}"
        return reply.json()["content"]
    except Exception as exc:
        # Best effort: every failure is turned into a message string.
        return f"接続エラー: {exc}"

# Usage example: send one prompt and print whatever query_server returns
result = query_server("Hello! How are you?")
print(result)

状態確認

# Check the system status
def check_system_status():
    """Print whether llama.cpp is built, whether a model file exists, and GPU info.

    Reads the module-level ``selected_model`` if it has been defined. When
    it has not (the download cell was not run), the model is reported as
    missing instead of raising NameError.
    """
    print("=== システム状態確認 ===")

    # llama.cpp: the build is considered present when the llama-cli binary exists.
    if os.path.exists('/content/llama.cpp/build/bin/llama-cli'):
        print("✅ llama.cpp: ビルド済み")
    else:
        print("❌ llama.cpp: ビルドが必要")

    # Model file: look the global up defensively so this check never crashes.
    model_path = globals().get('selected_model')
    if model_path and os.path.exists(model_path):
        file_size = os.path.getsize(model_path) / (1024**3)
        print(f"✅ 修正済みモデルファイル: {model_path} ({file_size:.1f}GB)")
        print("🔧 Unslothによる修正済み版を使用中")
    else:
        print("❌ モデルファイル: 見つかりません")

    # GPU: nvidia-smi prints name and total/used memory as CSV.
    print("\n=== GPU情報 ===")
    gpu_status = os.system(
        'nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv,noheader,nounits'
    )
    if gpu_status != 0:
        # nvidia-smi is missing or failed; say so instead of leaving the section empty.
        print("❌ GPU: 検出できません")

# Run the status check
check_system_status()

主な改良点

修正済みGGUFファイルの使用: Ollamaを使用せず、Unslothが修正したllama.cpp対応GGUFファイルを直接取得
重要なバグ修正: 通常のGemma3n-E4Bがllama.cppで動作しない問題を解決
ビルド時間短縮: 既存のビルドファイルがあれば再利用
自動ファイル管理: ダウンロード済みファイルの自動検出
エラーハンドリング: より堅牢なエラー処理
使いやすさ向上: ワンクリックでの実行

Unsloth版を使う理由

Unslothチームは、Google、Meta、Mistral、Microsoft等の主要なモデル開発チームと直接協力し、重要なバグ修正を行っています。特にGemma 3nに関しては：

Hugging FaceのXuan-Son Nguyenとllama.cppチームのGeorgi Gerganovと協力してGemma 3nをllama.cppで動作するように修正
Gemma 3での未初期化ウェイトの特定
GGUFがOllamaで正常に動作しない問題を修正
最適な推論パラメータ（temperature = 1.0, top_k = 64, top_p = 0.95）の設定

Gemma 3nの特徴

マルチモーダル対応: テキスト、画像、音声、動画入力をサポート
32K コンテキスト長: 長い文書の処理が可能
30秒音声入力: 音声認識と翻訳に対応
140言語サポート: 多言語での動作
効率的なアーキテクチャ: E4Bは総パラメータ数が約8Bでありながら、実効4Bパラメータ相当のメモリ使用量で動作

まとめ

これでHuggingFaceからUnslothの修正版を直接ダウンロードしたGemma3nを効率的にllama.cppで動かせます！通常のGoogle公式版では動作しないため、必ずUnsloth版を使用してください。