The most capable open model family, built from Gemini 3 research. Frontier intelligence — free, Apache 2.0 licensed, and running on your own hardware.
# Install Ollama (Mac / Linux)
curl -fsSL https://ollama.com/install.sh | sh

# Pull and run Gemma 4 — choose your size
ollama run gemma4:e2b   # 2B — phones, Pi
ollama run gemma4:e4b   # 4B — laptop
ollama run gemma4:26b   # 26B MoE — workstation ★ recommended
ollama run gemma4:31b   # 31B dense — server / A100

# REST API (once running)
curl http://localhost:11434/api/chat \
  -d '{"model":"gemma4","messages":[{"role":"user","content":"Hello!"}]}'
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

# Pick a checkpoint — swap model_id for your chosen size.
model_id = "google/gemma-4-26b-it"  # or e2b-it / e4b-it / 31b-it

# Processor handles chat templating + tokenization; model is sharded
# across available devices in bfloat16.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

chat = [{
    "role": "user",
    "content": [{"type": "text", "text": "Explain quantum entanglement."}],
}]

# Build model-ready tensors from the chat turn.
model_inputs = processor.apply_chat_template(
    chat,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    add_generation_prompt=True,
    enable_thinking=True,  # ← thinking mode!
).to(model.device)

generated_ids = model.generate(**model_inputs, max_new_tokens=2048)
print(processor.decode(generated_ids[0], skip_special_tokens=True))
# pip install google-genai
from google import genai

# One client, one call — the hosted API mirrors the local snippets.
client = genai.Client(api_key="YOUR_API_KEY")

response = client.models.generate_content(
    model="gemma-4-31b-it",  # or gemma-4-26b-it
    contents="Write a haiku about open-source AI.",
)
print(response.text)

# ── REST equivalent ─────────────────────────────────
#   curl "https://generativelanguage.googleapis.com/v1beta/
#     models/gemma-4-31b-it:generateContent?key=YOUR_KEY" \
#     -H 'Content-Type: application/json' -X POST \
#     -d '{"contents":[{"parts":[{"text":"Hello!"}]}]}'
# Install vLLM:
#   pip install vllm
#
# Start the OpenAI-compatible server:
#   vllm serve google/gemma-4-26b-it \
#     --dtype auto \
#     --max-model-len 65536 \
#     --tensor-parallel-size 2   # for multi-GPU

# Query via the OpenAI SDK — any OpenAI-compatible client works.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="none",
)

resp = client.chat.completions.create(
    model="google/gemma-4-26b-it",
    messages=[{"role": "user", "content": "Hello Gemma!"}],
)
print(resp.choices[0].message.content)
# Install MLX LM:
#   pip install mlx-lm
#
# CLI inference (4-bit quantized):
#   mlx_lm.generate \
#     --model mlx-community/gemma-4-e4b-it-4bit \
#     --prompt "Explain transformers simply." \
#     --max-tokens 512
#
# Or 26B at 4-bit for M2 Max / M3 Max (≥64 GB):
#   mlx_lm.generate \
#     --model mlx-community/gemma-4-26b-it-4bit \
#     --prompt "Write a Python quicksort." \
#     --max-tokens 1024

# Python API — load once, then generate.
from mlx_lm import load, generate

model, tokenizer = load("mlx-community/gemma-4-e4b-it-4bit")
response = generate(model, tokenizer, prompt="Hello!", max_tokens=256)
print(response)
# NVIDIA NIM — containerised, production-ready
# NOTE(review): tag fixed from "gemma-4-27b-it" — a size that appears
# nowhere else on this page; the family here is e2b / e4b / 26b / 31b.
docker run --gpus all \
  -p 8000:8000 \
  nvcr.io/nim/google/gemma-4-26b-it:latest

# Unsloth Fine-tune Studio (Mac / Linux / WSL)
curl -fsSL https://unsloth.ai/install.sh | sh
unsloth studio -H 0.0.0.0 -p 8888

# Dockerfile snippet for a custom build
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04
RUN pip install vllm transformers
COPY serve.py /app/serve.py
CMD ["python", "/app/serve.py"]
| Model | MMLU | HumanEval | MATH | GPQA | Arena.ai Rank |
|---|---|---|---|---|---|
| Gemma 4 31B ★ Open | — | — | — | — | #3 Global |
| Gemma 4 26B A4B MoE | — | — | — | — | Top 5 |
| GPT-4o | — | — | — | — | — |
| Llama 3.3 70B | — | — | — | — | — |
| Gemma 4 E4B Edge | — | — | — | — | Best Edge |