The Matchy Book
Matchy is a database for IP address and string matching. Matchy supports
matching IP addresses, CIDR ranges, exact strings, and glob patterns like *.evil.com with
microsecond-level query performance. You can build databases with structured data, query them
efficiently, and deploy them in multi-process applications with minimal memory overhead.
Sections
To get started with Matchy, install it and create your first database.
The guide covers everything you need to know about using Matchy to create and query databases for IP matching, string matching, and pattern matching.
The reference covers the details of various areas of Matchy, including the Rust API, C API, binary format, and architecture.
The commands section shows how to interact with Matchy databases using the command-line interface.
Learn how to contribute to Matchy development.
Appendices:
Other Documentation:
- Changelog — Detailed notes about changes in Matchy in each release.
Getting Started
This section provides a quick introduction to Matchy. Choose your path based on how you plan to use Matchy:
Using the CLI
If you want to build and query databases from the command line, or integrate Matchy into shell scripts and workflows:
Best for: Operations, DevOps, quick prototyping, standalone tools
Using the API
If you’re building an application that needs to query databases programmatically:
Best for: Application development, embedded systems, language integration
Both paths create compatible databases - a database built with the CLI can be queried by the API and vice versa.
Quick Start
Get up and running with Matchy in minutes.
Installation
From Source
git clone https://github.com/matchylabs/matchy
cd matchy
cargo build --release
As a Rust Dependency
Add to your Cargo.toml:
[dependencies]
matchy = "0.5"
Your First Database (Rust)
Here’s a complete example that builds and queries a threat intelligence database:
use matchy::{Database, DatabaseBuilder, MatchMode, DataValue, QueryResult};
use std::collections::HashMap;
fn main() -> Result<(), Box<dyn std::error::Error>> {
// 1. Create a builder
let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
// 2. Add IP address with threat data
let mut ip_data = HashMap::new();
ip_data.insert("threat_level".to_string(), DataValue::String("high".to_string()));
ip_data.insert("score".to_string(), DataValue::Uint32(95));
builder.add_entry("1.2.3.4", ip_data)?;
// 3. Add CIDR range
let mut cidr_data = HashMap::new();
cidr_data.insert("type".to_string(), DataValue::String("internal".to_string()));
builder.add_entry("10.0.0.0/8", cidr_data)?;
// 4. Add glob pattern
let mut pattern_data = HashMap::new();
pattern_data.insert("category".to_string(), DataValue::String("malware".to_string()));
builder.add_entry("*.evil.com", pattern_data)?;
// 5. Build and save
let database_bytes = builder.build()?;
std::fs::write("threats.mxy", &database_bytes)?;
println!("✅ Database built: {} bytes", database_bytes.len());
// 6. Open database (memory-mapped)
let db = Database::open("threats.mxy")?;
println!("✅ Database loaded in <1ms");
// 7. Query IP address
match db.lookup("1.2.3.4")? {
Some(QueryResult::Ip { data, prefix_len }) => {
println!("🔍 IP match: {:?} (/{prefix_len})", data);
}
_ => println!("No match"),
}
// 8. Query pattern
match db.lookup("malware.evil.com")? {
Some(QueryResult::Pattern { pattern_ids, data }) => {
println!("🔍 Pattern match: {} patterns", pattern_ids.len());
for (i, d) in data.iter().enumerate() {
if let Some(threat_data) = d {
println!(" Pattern {}: {:?}", pattern_ids[i], threat_data);
}
}
}
_ => println!("No match"),
}
Ok(())
}
Your First Database (C)
Complete C example:
#include "matchy.h"
#include <stdio.h>
int main() {
// 1. Build database
matchy_builder_t *builder = matchy_builder_new();
if (!builder) {
fprintf(stderr, "Failed to create builder\n");
return 1;
}
// 2. Add entries with JSON data
matchy_builder_add(builder, "1.2.3.4",
"{\"threat_level\": \"high\", \"score\": 95}");
matchy_builder_add(builder, "10.0.0.0/8",
"{\"type\": \"internal\"}");
matchy_builder_add(builder, "*.evil.com",
"{\"category\": \"malware\"}");
// 3. Save to file
int err = matchy_builder_save(builder, "threats.mxy");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to save database\n");
matchy_builder_free(builder);
return 1;
}
printf("✅ Database built\n");
matchy_builder_free(builder);
// 4. Open database
matchy_t *db = matchy_open("threats.mxy");
if (!db) {
fprintf(stderr, "Failed to open database\n");
return 1;
}
printf("✅ Database loaded\n");
// 5. Query IP address
matchy_result_t result = matchy_query(db, "1.2.3.4");
if (result.found) {
char *json = matchy_result_to_json(&result);
printf("🔍 IP match: %s\n", json);
matchy_free_string(json);
matchy_free_result(&result);
}
// 6. Query pattern
result = matchy_query(db, "malware.evil.com");
if (result.found) {
char *json = matchy_result_to_json(&result);
printf("🔍 Pattern match: %s\n", json);
matchy_free_string(json);
matchy_free_result(&result);
}
// 7. Cleanup
matchy_close(db);
printf("✅ Done\n");
return 0;
}
Compile and run:
gcc -o example example.c -I./include -L./target/release -lmatchy
LD_LIBRARY_PATH=./target/release ./example
What Just Happened?
- Built a database - Added IPs, CIDR ranges, and patterns with structured data
- Saved to disk - Wrote optimized binary format (.mxy file)
- Loaded instantly - Memory-mapped the file (<1ms load time)
- Queried efficiently - Looked up IPs and patterns in microseconds
Key Concepts
Automatic Type Detection
You don’t need to specify whether an entry is an IP, CIDR, or pattern. Matchy detects automatically:
#![allow(unused)]
fn main() {
builder.add_entry("1.2.3.4", data)?; // Detected as IP
builder.add_entry("10.0.0.0/8", data)?; // Detected as CIDR
builder.add_entry("*.evil.com", data)?; // Detected as glob pattern
builder.add_entry("evil.com", data)?; // Detected as exact string
}
Database Immutability
Databases are read-only once built. To update:
- Create new builder
- Add all entries (old + new + modified)
- Build new database
- Atomically replace old file
This ensures readers always see consistent state.
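A minimal sketch of this update pattern in Rust, rebuilding with the builder API shown above and using a temporary file plus rename for the atomic swap (the entries and paths here are illustrative):
use matchy::{DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;
fn rebuild(path: &str) -> Result<(), Box<dyn std::error::Error>> {
    // 1-2. New builder; re-add every entry (old + new + modified)
    let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
    let mut data = HashMap::new();
    data.insert("threat_level".to_string(), DataValue::String("high".to_string()));
    builder.add_entry("1.2.3.4", data)?;
    // ... add the remaining entries ...
    // 3. Build the new database
    let bytes = builder.build()?;
    // 4. Write to a temporary file, then atomically replace the old one
    let tmp = format!("{path}.tmp");
    std::fs::write(&tmp, &bytes)?;
    std::fs::rename(&tmp, path)?; // readers keep seeing a consistent file
    Ok(())
}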
Memory Mapping
Databases use mmap() for:
- Instant loading - No deserialization overhead
- Memory efficiency - OS shares pages across processes
- Large databases - Work with databases larger than RAM
Next Steps
- Installation Guide - Detailed setup instructions
- Rust API Guide - Complete Rust API documentation
- C API Guide - Complete C API documentation
- Architecture - How Matchy works internally
Building Your First Database
This tutorial walks you through building a complete threat intelligence database from scratch.
What We’ll Build
A database containing:
- Malicious IP addresses with threat scores
- CIDR ranges for known botnets
- Domain patterns for phishing sites
- Exact domains on a blocklist
Step 1: Create the Builder
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
}
The MatchMode determines how patterns are matched:
- CaseSensitive - “Evil.com” ≠ “evil.com”
- CaseInsensitive - “Evil.com” = “evil.com” (recommended for domains)
Step 2: Add IP Addresses
Add individual IPs with rich metadata:
#![allow(unused)]
fn main() {
let mut threat_data = HashMap::new();
threat_data.insert("threat_level".to_string(), DataValue::String("critical".to_string()));
threat_data.insert("score".to_string(), DataValue::Uint32(95));
threat_data.insert("first_seen".to_string(), DataValue::String("2024-01-15".to_string()));
threat_data.insert("category".to_string(), DataValue::String("c2_server".to_string()));
builder.add_entry("192.0.2.1", threat_data)?;
}
Step 3: Add CIDR Ranges
CIDR ranges match all IPs within the range:
#![allow(unused)]
fn main() {
let mut botnet_data = HashMap::new();
botnet_data.insert("network".to_string(), DataValue::String("mirai_botnet".to_string()));
botnet_data.insert("threat_level".to_string(), DataValue::String("high".to_string()));
builder.add_entry("203.0.113.0/24", botnet_data)?;
}
Step 4: Add Glob Patterns
Patterns use wildcards to match multiple domains:
#![allow(unused)]
fn main() {
// Match any subdomain of evil.com
let mut pattern_data = HashMap::new();
pattern_data.insert("category".to_string(), DataValue::String("phishing".to_string()));
pattern_data.insert("threat_level".to_string(), DataValue::String("high".to_string()));
builder.add_entry("*.evil.com", pattern_data)?;
// Match specific patterns
let mut malware_data = HashMap::new();
malware_data.insert("category".to_string(), DataValue::String("malware_download".to_string()));
builder.add_entry("http://*/admin/config.php", malware_data)?;
}
Step 5: Add Exact Strings
For known exact matches (no wildcards):
#![allow(unused)]
fn main() {
let mut blocklist_data = HashMap::new();
blocklist_data.insert("reason".to_string(), DataValue::String("confirmed_malware".to_string()));
blocklist_data.insert("blocked_date".to_string(), DataValue::String("2024-10-01".to_string()));
builder.add_entry("malicious-site.example", blocklist_data)?;
}
Step 6: Build and Save
#![allow(unused)]
fn main() {
// Build the database (returns bytes)
let database_bytes = builder.build()?;
// Save to file
std::fs::write("threats.mxy", &database_bytes)?;
println!("✅ Database built: {} bytes", database_bytes.len());
}
Step 7: Query the Database
#![allow(unused)]
fn main() {
use matchy::{Database, QueryResult};
// Open the database (memory-mapped, loads in <1ms)
let db = Database::open("threats.mxy")?;
// Query IP address
match db.lookup("192.0.2.1")? {
Some(QueryResult::Ip { data, prefix_len }) => {
println!("Found IP: {:?}", data);
println!("Matched CIDR: /{}", prefix_len);
}
_ => println!("Not found"),
}
// Query domain (matches pattern *.evil.com)
match db.lookup("phishing.evil.com")? {
Some(QueryResult::Pattern { pattern_ids, data }) => {
println!("Matched {} patterns", pattern_ids.len());
for (i, threat_data) in data.iter().enumerate() {
if let Some(d) = threat_data {
println!("Pattern {}: {:?}", pattern_ids[i], d);
}
}
}
_ => println!("No match"),
}
}
Pattern Types
Matchy automatically detects entry types:
| Entry | Type | Description |
|---|---|---|
| 192.0.2.1 | IP Address | Single host |
| 192.0.2.0/24 | CIDR Range | Network block |
| *.evil.com | Glob Pattern | Wildcard domain |
| evil.com | Exact String | Literal match |
Performance Tips
- Build once, query many - Building is one-time, queries are microseconds
- Use CIDR ranges - More efficient than individual IPs
- Prefer suffix patterns - *.evil.com is faster than evil-*
- Exact strings are fastest - O(1) hash lookup
Next Steps
- Rust API Reference - Complete API documentation
- Data Types - All supported data types
- Performance Guide - Optimization techniques
Using the CLI
The Matchy command-line interface lets you build and query databases without writing code. This is perfect for:
- Operations and DevOps workflows
- Quick prototyping and testing
- Shell scripts and automation
- One-off queries and analysis
What You’ll Learn
- Installing the CLI - Install the matchy command-line tool
- First Database with CLI - Build and query your first database
Example Workflow
$ # Build a database from a CSV file
$ matchy build threats.csv -o threats.mxy
$ # Query it
$ matchy query threats.mxy 192.0.2.1
Found: IP address 192.0.2.1
threat_level: "high"
category: "malware"
$ # Benchmark performance
$ matchy bench threats.mxy
Queries per second: 7,234,891
Average latency: 138ns
After completing this section, check out the CLI Commands reference for detailed documentation on all available commands.
Installing the CLI
Prerequisites
The Matchy CLI requires Rust to build. If you don’t have Rust installed:
$ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
Verify installation:
$ rustc --version
rustc 1.70.0 (or later)
Installing from crates.io
The easiest way to install the Matchy CLI is from crates.io:
$ cargo install matchy
Updating crates.io index
Downloaded matchy v2.0.0
Compiling matchy v2.0.0
Finished release [optimized] target(s) in 2m 15s
Installing ~/.cargo/bin/matchy
Verify the installation:
$ matchy --version
matchy 2.0.0
Installing from source
To install the latest development version:
$ git clone https://github.com/matchylabs/matchy
$ cd matchy
$ cargo install --path .
Using without installation
You can also run Matchy directly from the source repository without installing:
$ git clone https://github.com/matchylabs/matchy
$ cd matchy
$ cargo run --release -- --version
matchy 2.0.0
Use cargo run --release -- instead of matchy for all commands.
Next Steps
Now that you have the CLI installed, let’s build your first database:
First Database with CLI
Let’s build and query a database using the Matchy CLI.
Create input data
First, create a CSV file with some sample data. Create a file called threats.csv:
key,threat_level,category
192.0.2.1,high,malware
203.0.113.0/24,medium,botnet
*.evil.com,high,phishing
malicious-site.com,critical,c2_server
Each row defines an entry:
- key - IP address, CIDR range, pattern, or exact string
- Other columns become data fields associated with the entry
Build the database
Use matchy build to create a database:
$ matchy build threats.csv -o threats.mxy
Building database from threats.csv
Added 4 entries
Database size: 2,847 bytes
Successfully wrote threats.mxy
This creates threats.mxy, a binary database file.
Query the database
Now query it with matchy query:
$ matchy query threats.mxy 192.0.2.1
Found: IP address 192.0.2.1
threat_level: "high"
category: "malware"
The CLI automatically detects that 192.0.2.1 is an IP address and performs an IP lookup.
Query a CIDR range
IPs within a CIDR range match that range:
$ matchy query threats.mxy 203.0.113.42
Found: IP address 203.0.113.42 (matched 203.0.113.0/24)
threat_level: "medium"
category: "botnet"
Query a pattern
Patterns match using wildcards:
$ matchy query threats.mxy phishing.evil.com
Found: Pattern match
Matched patterns: *.evil.com
threat_level: "high"
category: "phishing"
The domain phishing.evil.com matches the pattern *.evil.com.
Query an exact string
Exact strings must match completely:
$ matchy query threats.mxy malicious-site.com
Found: Exact string match
threat_level: "critical"
category: "c2_server"
Inspect the database
Use matchy inspect to see what’s inside:
$ matchy inspect threats.mxy
Database: threats.mxy
Size: 2,847 bytes
Match mode: CaseInsensitive
IP entries: 2
String entries: 1
Pattern entries: 1
Performance estimate:
IP queries: ~7M/sec
Pattern queries: ~2M/sec
Benchmark performance
Test query performance with matchy bench:
$ matchy bench threats.mxy
Running benchmarks on threats.mxy...
IP lookups: 7,234,891 queries/sec (138ns avg)
Pattern lookups: 2,156,892 queries/sec (463ns avg)
String lookups: 8,932,441 queries/sec (112ns avg)
Input formats
The CLI supports multiple input formats:
- CSV - Comma-separated values (shown above)
- JSON - Array of JSON objects
- JSONL - JSON Lines (one JSON object per line)
- TSV - Tab-separated values
See Input File Formats for details.
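For illustration only, a JSONL input might look like the lines below; the key/data layout shown here mirrors the JSON example in the Entry Types chapter and is an assumption - consult Input File Formats for the exact schema:
{"key": "192.0.2.1", "data": {"threat_level": "high"}}
{"key": "*.evil.com", "data": {"category": "phishing"}}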
What just happened?
You just:
- Created a CSV file with threat data
- Built a binary database (threats.mxy)
- Queried IPs, CIDR ranges, patterns, and exact strings
- Inspected the database structure
- Benchmarked query performance
The database loads in under 1ms using memory mapping, making it perfect for production use in high-throughput applications.
Going further
- CLI Commands Reference - Complete CLI documentation
- Input File Formats - All supported input formats
- Matchy Guide - Deeper dive into Matchy concepts
To integrate Matchy into your application code, see Using the API.
Using the API
The Matchy API lets you build and query databases programmatically from your application code. This is perfect for:
- Application development (servers, services, tools)
- Embedded systems and constrained environments
- Language integration (Rust, C/C++, Python, etc.)
- Custom data processing pipelines
What You’ll Learn
- Installing as a Library - Add Matchy to your project
- First Database with Rust - Build and query using Rust
- First Database with C - Build and query using C/C++
Example (Rust)
#![allow(unused)]
fn main() {
use matchy::{Database, DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;
// Build database
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
let mut data = HashMap::new();
data.insert("threat".to_string(), DataValue::String("high".to_string()));
builder.add_entry("192.0.2.1", data)?;
let db_bytes = builder.build()?;
std::fs::write("threats.mxy", &db_bytes)?;
// Query database
let db = Database::open("threats.mxy")?;
if let Some(result) = db.lookup("192.0.2.1")? {
println!("Found: {:?}", result);
}
}
Example (C)
#include "matchy.h"
// Build database
matchy_builder_t *builder = matchy_builder_new();
matchy_builder_add(builder, "192.0.2.1", "{\"threat\": \"high\"}");
matchy_builder_save(builder, "threats.mxy");
matchy_builder_free(builder);
// Query database
matchy_t *db = matchy_open("threats.mxy");
matchy_result_t result = matchy_query(db, "192.0.2.1");
if (result.found) {
char *json = matchy_result_to_json(&result);
printf("Found: %s\n", json);
matchy_free_string(json);
matchy_free_result(&result);
}
matchy_close(db);
Going further
After completing this section, check out:
- Matchy Guide - Deeper dive into concepts
- Rust API Reference - Complete Rust API docs
- C API Reference - Complete C API docs
Installing as a Library
For Rust Projects
Add Matchy to your Cargo.toml:
Full Installation (includes CLI dependencies)
[dependencies]
matchy = "2.0"
Library Only (minimal dependencies)
If you’re only using Matchy as a library and don’t need CLI components, save ~40 transitive dependencies:
[dependencies]
matchy = { version = "2.0", default-features = false }
This excludes CLI-only dependencies (clap, notify, ctrlc, csv) while keeping all core functionality.
Then run cargo build:
$ cargo build
Updating crates.io index
Downloading matchy v2.0
Compiling matchy v2.0
Compiling your-project v0.1.0
That’s it! You can now use Matchy in your Rust code.
For C/C++ Projects
Option 1: Using cargo-c (Recommended)
Install the system-wide C library:
$ cargo install cargo-c
$ git clone https://github.com/matchylabs/matchy
$ cd matchy
$ cargo cinstall --release --prefix=/usr/local
This installs:
- Headers to /usr/local/include/matchy/
- Library to /usr/local/lib/
- pkg-config file to /usr/local/lib/pkgconfig/
Compile your project:
$ gcc myapp.c $(pkg-config --cflags --libs matchy) -o myapp
Option 2: Manual Installation
- Build the library:
$ git clone https://github.com/matchylabs/matchy
$ cd matchy
$ cargo build --release
- Copy files:
$ sudo cp target/release/libmatchy.* /usr/local/lib/
$ sudo cp include/matchy.h /usr/local/include/
- Update library cache (Linux):
$ sudo ldconfig
- Compile your project:
$ gcc myapp.c -I/usr/local/include -L/usr/local/lib -lmatchy -o myapp
For Other Languages
Matchy provides a C API that can be called from any language with C FFI support:
- Python: Use ctypes or cffi
- Go: Use cgo
- Node.js: Use node-ffi or napi
- Ruby: Use fiddle or ffi
See the C API Reference for the full API specification.
Next Steps
Choose your language:
First Database with Rust
Let’s build and query a database using the Rust API.
Create a new project
$ cargo new --bin matchy-example
$ cd matchy-example
Add Matchy to Cargo.toml:
[dependencies]
matchy = "2.0"
Write the code
Edit src/main.rs:
use matchy::{Database, DatabaseBuilder, MatchMode, DataValue, QueryResult};
use std::collections::HashMap;
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Create a builder
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
// Add an IP address
let mut ip_data = HashMap::new();
ip_data.insert("threat_level".to_string(), DataValue::String("high".to_string()));
ip_data.insert("category".to_string(), DataValue::String("malware".to_string()));
builder.add_entry("192.0.2.1", ip_data)?;
// Add a CIDR range
let mut cidr_data = HashMap::new();
cidr_data.insert("network".to_string(), DataValue::String("internal".to_string()));
builder.add_entry("10.0.0.0/8", cidr_data)?;
// Add a pattern
let mut pattern_data = HashMap::new();
pattern_data.insert("category".to_string(), DataValue::String("phishing".to_string()));
builder.add_entry("*.evil.com", pattern_data)?;
// Build and save
let database_bytes = builder.build()?;
std::fs::write("threats.mxy", &database_bytes)?;
println!("✅ Built database: {} bytes", database_bytes.len());
// Open the database (memory-mapped)
let db = Database::open("threats.mxy")?;
println!("✅ Loaded database");
// Query an IP address
match db.lookup("192.0.2.1")? {
Some(QueryResult::Ip { data, prefix_len }) => {
println!("🔍 IP match (/{}):", prefix_len);
println!(" {:?}", data);
}
_ => println!("Not found"),
}
// Query a pattern
match db.lookup("phishing.evil.com")? {
Some(QueryResult::Pattern { pattern_ids, data }) => {
println!("🔍 Pattern match:");
println!(" Matched {} pattern(s)", pattern_ids.len());
println!(" {:?}", data[0]);
}
_ => println!("Not found"),
}
Ok(())
}
Run it
$ cargo run
Compiling matchy v2.0
Compiling matchy-example v0.1.0
Finished dev [unoptimized] target(s)
Running `target/debug/matchy-example`
✅ Built database: 2847 bytes
✅ Loaded database
🔍 IP match (/32):
{"threat_level": String("high"), "category": String("malware")}
🔍 Pattern match:
Matched 1 pattern(s)
Some({"category": String("phishing")})
Understanding the code
1. Create a DatabaseBuilder
#![allow(unused)]
fn main() {
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
}
The match mode determines whether string comparisons are case-sensitive.
CaseInsensitive is recommended for domain matching.
2. Add entries
#![allow(unused)]
fn main() {
builder.add_entry("192.0.2.1", ip_data)?;
}
The add_entry method accepts any string key and a HashMap<String, DataValue> for the
associated data. Matchy automatically detects whether the key is an IP, CIDR, pattern, or
exact string.
Advanced: For explicit control over entry types, use type-specific methods:
#![allow(unused)]
fn main() {
builder.add_ip("192.0.2.1", data)?; // Force IP
builder.add_literal("*.txt", data)?; // Force exact match (no wildcard)
builder.add_glob("*.evil.com", data)?; // Force pattern
}
Or use type prefixes with add_entry:
#![allow(unused)]
fn main() {
builder.add_entry("literal:file*.txt", data)?; // Match literal asterisk
builder.add_entry("glob:simple.com", data)?; // Force pattern matching
}
See Entry Types - Prefix Technique for details.
3. Build the database
#![allow(unused)]
fn main() {
let database_bytes = builder.build()?;
std::fs::write("threats.mxy", &database_bytes)?;
}
The build() method produces a Vec<u8> containing the optimized binary database. You
can write it to a file or transmit it over a network.
4. Open and query
#![allow(unused)]
fn main() {
let db = Database::open("threats.mxy")?;
let result = db.lookup("192.0.2.1")?;
}
Database::open() memory-maps the file, loading it in under 1ms. The lookup() method
returns an Option<QueryResult> that indicates whether a match was found and what type
of match it was.
Data types
Matchy supports several data value types:
#![allow(unused)]
fn main() {
use matchy::DataValue;
let mut data = HashMap::new();
data.insert("string".to_string(), DataValue::String("text".to_string()));
data.insert("integer".to_string(), DataValue::Uint32(42));
data.insert("float".to_string(), DataValue::Double(3.14));
data.insert("boolean".to_string(), DataValue::Bool(true));
data.insert("array".to_string(), DataValue::Array(vec![
DataValue::String("one".to_string()),
DataValue::String("two".to_string()),
]));
}
See Data Types and Values for complete details.
Error handling
All Matchy operations return Result<T, MatchyError>:
#![allow(unused)]
fn main() {
match db.lookup("192.0.2.1") {
Ok(Some(result)) => println!("Found: {:?}", result),
Ok(None) => println!("Not found"),
Err(e) => eprintln!("Error: {}", e),
}
}
Going further
- Matchy Guide - Deeper dive into concepts
- Rust API Reference - Complete API documentation
- Data Types - All supported data types
- Pattern Matching - Glob pattern syntax
First Database with C
Let’s build and query a database using the C API.
Create a source file
Create example.c:
#include "matchy.h"
#include <stdio.h>
#include <stdlib.h>
int main() {
// Create a builder
matchy_builder_t *builder = matchy_builder_new();
if (!builder) {
fprintf(stderr, "Failed to create builder\n");
return 1;
}
// Add entries with JSON data
int err = matchy_builder_add(builder, "192.0.2.1",
"{\"threat_level\": \"high\", \"category\": \"malware\"}");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add IP entry\n");
matchy_builder_free(builder);
return 1;
}
matchy_builder_add(builder, "10.0.0.0/8",
"{\"network\": \"internal\"}");
matchy_builder_add(builder, "*.evil.com",
"{\"category\": \"phishing\"}");
// Save to file
err = matchy_builder_save(builder, "threats.mxy");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to save database\n");
matchy_builder_free(builder);
return 1;
}
printf("✅ Built database\n");
matchy_builder_free(builder);
// Open the database
matchy_t *db = matchy_open("threats.mxy");
if (!db) {
fprintf(stderr, "Failed to open database\n");
return 1;
}
printf("✅ Loaded database\n");
// Query an IP address
matchy_result_t result = matchy_query(db, "192.0.2.1");
if (result.found) {
char *json = matchy_result_to_json(&result);
printf("🔍 IP match: %s\n", json);
matchy_free_string(json);
matchy_free_result(&result);
}
// Query a pattern
result = matchy_query(db, "phishing.evil.com");
if (result.found) {
char *json = matchy_result_to_json(&result);
printf("🔍 Pattern match: %s\n", json);
matchy_free_string(json);
matchy_free_result(&result);
}
// Cleanup
matchy_close(db);
printf("✅ Done\n");
return 0;
}
Compile and run
$ gcc -o example example.c -I/usr/local/include -L/usr/local/lib -lmatchy
$ ./example
✅ Built database
✅ Loaded database
🔍 IP match: {"threat_level":"high","category":"malware"}
🔍 Pattern match: {"category":"phishing"}
✅ Done
If you get “library not found” errors:
$ export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH # Linux
$ export DYLD_LIBRARY_PATH=/usr/local/lib:$DYLD_LIBRARY_PATH # macOS
Understanding the code
1. Create a builder
matchy_builder_t *builder = matchy_builder_new();
The builder is an opaque handle. Always check for NULL on creation.
2. Add entries
matchy_builder_add(builder, "192.0.2.1",
"{\"threat_level\": \"high\", \"category\": \"malware\"}");
The C API uses JSON strings for data. Matchy automatically detects whether the key is an IP, CIDR, pattern, or exact string.
3. Save the database
int err = matchy_builder_save(builder, "threats.mxy");
Returns MATCHY_SUCCESS (0) on success, or an error code otherwise.
4. Open and query
matchy_t *db = matchy_open("threats.mxy");
matchy_result_t result = matchy_query(db, "192.0.2.1");
The database is memory-mapped for instant loading. Check result.found to see if
a match was found.
Note: For FFI systems that have issues with return-by-value structs (like some Java JNA configurations on ARM64), use matchy_query_into() instead:
matchy_result_t result;
matchy_query_into(db, "192.0.2.1", &result);
Both functions are equivalent; matchy_query_into() writes to a pointer you provide.
5. Cleanup
matchy_free_result(&result);
matchy_close(db);
matchy_builder_free(builder);
Always free resources when done. The C API uses manual memory management.
Error handling
Check return values:
int err = matchy_builder_add(builder, key, data);
if (err != MATCHY_SUCCESS) {
const char *msg = matchy_error_message(err);
fprintf(stderr, "Error: %s\n", msg);
}
Error codes:
- MATCHY_SUCCESS (0) - Operation succeeded
- MATCHY_ERROR_INVALID_PARAM - NULL pointer or invalid parameter
- MATCHY_ERROR_FILE_NOT_FOUND - File doesn’t exist
- MATCHY_ERROR_INVALID_FORMAT - Corrupt or wrong format
- MATCHY_ERROR_PARSE_FAILED - JSON parsing failed
- MATCHY_ERROR_UNKNOWN - Other error
Memory management
The C API follows these rules:
- Strings returned by Matchy must be freed:
char *json = matchy_result_to_json(&result);
// Use json...
matchy_free_string(json);
- Results must be freed:
matchy_result_t result = matchy_query(db, "key");
// Use result...
matchy_free_result(&result);
- Handles must be freed:
matchy_builder_free(builder);
matchy_close(db);
See C Memory Management for complete details.
Thread safety
- Database handles (matchy_t*) are thread-safe for concurrent queries
- Builder handles (matchy_builder_t*) are NOT thread-safe - don’t share a builder across threads
- Multiple threads can safely query the same database
Going further
- C API Reference - Complete C API documentation
- C Memory Management - Memory rules and patterns
- Matchy Guide - Deeper dive into concepts
Matchy Guide
This guide covers the concepts you need to understand how Matchy works, regardless of whether you’re using the CLI, Rust API, or C API.
If you’re looking for tool-specific instructions, see:
- Getting Started - First time using CLI or API
- CLI Commands - CLI reference
- Rust API Reference - Rust API reference
- C API Reference - C API reference
Concepts
- Why Matchy Exists
- Database Concepts
- Entry Types
- Pattern Matching
- Data Types and Values
- Query Result Caching
- Pattern Extraction
- MMDB Compatibility
- Migrating from libmaxminddb
- Performance Considerations
Why Matchy Exists
The Problem
Many applications need to match IP addresses and strings against large datasets. Common use cases include:
- Threat intelligence: checking IPs and domains against blocklists
- GeoIP lookups: finding location data for IP addresses
- Domain categorization: classifying websites by patterns
- Network security: matching against indicators of compromise
Traditional approaches have significant limitations:
Hash tables provide fast exact lookups, but can’t match patterns. You can’t use a hash
table to match phishing.evil.com against a pattern like *.evil.com.
Sequential scanning works for patterns but doesn’t scale. With 10,000 patterns, you perform 10,000 comparisons per lookup. This approach quickly becomes a bottleneck.
Multiple data structures add complexity. Using a hash table for exact matches, a tree for IP ranges, and pattern matching for domains means maintaining three separate systems.
Serialization overhead slows down loading. Traditional databases need to parse and deserialize data on startup, which can take hundreds of milliseconds or more.
Memory duplication wastes resources. In multi-process applications, each process loads its own copy of the database, multiplying memory usage.
The Solution
Matchy addresses these problems with a unified approach:
Automatic type detection means one database holds IPs, CIDR ranges, exact strings, and patterns. You don’t need to know which type you’re querying - Matchy figures it out.
Optimized data structures provide efficient lookups for each type. IPs use a binary trie. Exact strings use hash tables. Patterns use the Aho-Corasick algorithm.
Memory mapping eliminates deserialization. Databases are memory-mapped files that load in under a millisecond. The operating system shares pages across processes automatically.
Compact binary format reduces size. Matchy uses a space-efficient binary representation similar to MaxMind’s MMDB format.
Performance
A typical Matchy database can perform:
- 7M+ IP address lookups per second
- 1M+ pattern matches per second (with 50,000 patterns)
- Sub-microsecond latency for individual queries
- Sub-millisecond loading time
Compatibility
Matchy can read standard MaxMind MMDB files, making it a drop-in replacement for GeoIP databases. It extends the MMDB format to support string matching and patterns while maintaining compatibility with existing files.
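As a quick illustration (the file path and query are placeholders), an existing GeoIP database can be opened with the same Rust API used elsewhere in this book:
use matchy::Database;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Open a standard MaxMind MMDB file directly
    let geoip = Database::open("GeoLite2-City.mmdb")?;
    // Look it up just like a .mxy database
    if let Some(record) = geoip.lookup("8.8.8.8")? {
        println!("GeoIP record: {:?}", record);
    }
    Ok(())
}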
When to Use Matchy
Matchy is designed for applications that need:
- Fast lookups against large datasets
- Pattern matching in addition to exact matches
- IP address and string matching in the same database
- Minimal memory overhead in multi-process architectures
- Quick database loading without deserialization
If you only need exact string matching and already have a solution that works, Matchy might be overkill. But if you need patterns, IPs, and efficiency at scale, Matchy was built for you.
Database Concepts
This chapter covers the fundamental concepts of Matchy databases.
What is a Database?
A Matchy database is a binary file containing:
- Entries - IP addresses, CIDR ranges, patterns, or exact strings
- Data - Structured information associated with each entry
- Indexes - Optimized data structures for fast lookups
Databases use the .mxy extension by convention, though any extension works.
Immutability
Databases are read-only once built. You cannot add, remove, or modify entries in an existing database.
To update a database:
- Create a new builder
- Add all entries (old + new + modified)
- Build the new database
- Atomically replace the old file
This ensures readers always see consistent state and enables safe concurrent access.
Entry Types
Matchy automatically detects four types of entries:
| Entry Type | Example | Matches |
|---|---|---|
| IP Address | 192.0.2.1 | Exact IP address |
| CIDR Range | 10.0.0.0/8 | All IPs in range |
| Pattern | *.example.com | Strings matching glob |
| Exact String | example.com | Exact string only |
You don’t need to specify the type - Matchy infers it from the format.
Auto-Detection
When you query a database, Matchy automatically:
- Checks if the query is an IP address → searches IP tree
- Checks for exact string match → searches hash table
- Searches patterns → uses Aho-Corasick algorithm
This makes querying simple: db.lookup("anything") works for all types.
Memory Mapping
Databases use memory mapping (mmap) for instant loading:
Traditional Database Matchy Database
───────────────────── ─────────────────
1. Open file 1. Open file
2. Read into memory 2. Memory map
3. Parse format 3. Done! (<1ms)
4. Build data structures
(100-500ms for large DB)
Memory mapping has several benefits:
Instant loading - Databases load in under 1 millisecond regardless of size.
Shared memory - The OS shares memory-mapped pages across processes automatically:
- 64 processes with a 100MB database = ~100MB RAM total
- Traditional approach = 64 × 100MB = 6,400MB RAM
Large databases - Work with databases larger than available RAM. The OS pages data in and out as needed.
Binary Format
Databases use a compact binary format based on MaxMind’s MMDB specification:
- IP tree - Binary trie for IP address lookups (MMDB compatible)
- Hash table - For exact string matches (Matchy extension)
- Aho-Corasick automaton - For pattern matching (Matchy extension)
- Data section - Structured data storage (MMDB compatible)
This means:
- Standard MMDB readers can read the IP portion
- Matchy can read standard MMDB files (like GeoIP databases)
- Cross-platform compatible (same file works on Linux, macOS, Windows)
Building a Database
The general workflow is:
- Create a builder - Specify match mode (case-sensitive or not)
- Add entries - Add IP addresses, patterns, strings with associated data
- Build - Generate optimized binary format
- Save - Write to file
How to build:
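A minimal Rust sketch of these four steps (the CLI and C API follow the same workflow; see the Getting Started chapters for tool-specific versions):
use matchy::{DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 1. Create a builder with a match mode
    let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
    // 2. Add entries with associated data
    let mut data = HashMap::new();
    data.insert("category".to_string(), DataValue::String("phishing".to_string()));
    builder.add_entry("*.evil.com", data)?;
    // 3. Build the optimized binary format
    let bytes = builder.build()?;
    // 4. Save to file
    std::fs::write("threats.mxy", &bytes)?;
    Ok(())
}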
Querying a Database
The query process:
- Open database - Memory map the file
- Query - Call lookup with any string
- Get result - Receive match data or None
How to query:
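A minimal Rust sketch of the query side (again, the CLI and C API mirror these steps):
use matchy::Database;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 1. Open the database (memory-mapped)
    let db = Database::open("threats.mxy")?;
    // 2. Query with any string - Matchy detects the type
    match db.lookup("phishing.evil.com")? {
        // 3. Get result - match data or None
        Some(result) => println!("Match: {:?}", result),
        None => println!("No match"),
    }
    Ok(())
}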
Query Results
Queries return one of:
- IP match - IP address or CIDR range matched
- Pattern match - One or more patterns matched
- Exact match - Exact string matched
- No match - No entries matched
For pattern matches, Matchy returns all matching patterns and their associated data.
This is useful when multiple patterns match (e.g., *.com and example.* both match
example.com).
Database Size
Database size depends on:
- Number of entries
- Pattern complexity (more patterns = larger automaton)
- Data size (structured data per entry)
Typical sizes:
- 1,000 entries - ~50-100KB
- 10,000 entries - ~500KB-1MB
- 100,000 entries - ~5-10MB
- 1,000,000 entries - ~50-100MB
Pattern-heavy databases are larger due to the Aho-Corasick automaton.
Thread Safety
Databases are thread-safe for concurrent queries:
- Multiple threads can safely query the same database
- Memory-mapped data is read-only
- No locking required
Builders are NOT thread-safe:
- Don’t share a builder across threads
- Build databases sequentially
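A minimal sketch of the concurrent-query pattern described above, sharing one memory-mapped database across threads with Arc:
use matchy::Database;
use std::sync::Arc;
use std::thread;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let db = Arc::new(Database::open("threats.mxy")?);
    let mut handles = Vec::new();
    for _ in 0..4 {
        let db = Arc::clone(&db);
        handles.push(thread::spawn(move || {
            // Read-only lookups from many threads need no locking
            let _ = db.lookup("192.0.2.1");
        }));
    }
    for handle in handles {
        handle.join().unwrap();
    }
    Ok(())
}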
Compatibility
Databases are:
- ✅ Platform-independent - Same file on Linux, macOS, Windows
- ✅ Tool-independent - CLI-built databases work with APIs
- ✅ Language-independent - Rust-built databases work with C
- ✅ MMDB-compatible - Can read standard MaxMind databases
Next Steps
Now that you understand database concepts, dive into specific topics:
- Entry Types - Deep dive on IP, CIDR, patterns, strings
- Pattern Matching - Glob syntax and matching rules
- Data Types and Values - What data you can store
- Performance Considerations - Optimization strategies
Entry Types
Matchy supports four types of entries, automatically detected based on the format of the key.
IP Addresses
Format: Standard IPv4 or IPv6 address notation
Examples:
- 192.0.2.1
- 2001:db8::1
- 10.0.0.1
Matching: Exact IP address only
Entry: 192.0.2.1
Matches: 192.0.2.1
Doesn't match: 192.0.2.2, 192.0.2.0
Use cases:
- Known malicious IPs
- Specific hosts
- Allowlist/blocklist
CIDR Ranges
Format: IP address with subnet mask (slash notation)
Examples:
- 10.0.0.0/8
- 192.168.0.0/16
- 2001:db8::/32
Matching: All IP addresses within the range
Entry: 10.0.0.0/8
Matches: 10.0.0.1, 10.255.255.255, 10.123.45.67
Doesn't match: 11.0.0.1, 9.255.255.255
The number after the slash indicates how many bits are fixed:
- /8 - First 8 bits fixed (~16.7 million addresses)
- /16 - First 16 bits fixed (~65,000 addresses)
- /24 - First 24 bits fixed (256 addresses)
- /32 - All 32 bits fixed (single address, equivalent to an IP entry)
Use cases:
- Network blocks
- Organization IP ranges
- Geographic regions
- Cloud provider ranges
Best practice: Use CIDR ranges instead of individual IPs when possible. It’s more efficient than adding thousands of individual IP addresses.
Patterns (Globs)
Format: String containing wildcard characters (* or ?)
Examples:
- *.example.com
- test-*.domain.com
- http://*/admin/*
Matching: Strings matching the glob pattern
Entry: *.example.com
Matches: foo.example.com, bar.example.com, sub.domain.example.com
Doesn't match: example.com, example.com.foo
Wildcard rules:
- * - Matches zero or more of any character
- ? - Matches exactly one character
- [abc] - Matches one character from the set
- [!abc] - Matches one character NOT in the set
See Pattern Matching for complete syntax details.
Use cases:
- Domain wildcards (malware families)
- URL patterns
- Flexible matching rules
- Category-based blocking
Performance: Pattern matching uses the Aho-Corasick algorithm, which searches for all patterns simultaneously. Query time is roughly constant regardless of the number of patterns (within reason).
Exact Strings
Format: Any string without wildcard characters and not an IP/CIDR
Examples:
- example.com
- malicious-site.net
- test-string-123
Matching: Exact string only (case-sensitive or insensitive based on match mode)
Entry: example.com
Matches: example.com (case-insensitive mode: Example.com, EXAMPLE.COM)
Doesn't match: foo.example.com, example.com/path
Use cases:
- Known malicious domains
- Exact matches
- High-confidence indicators
- Allowlists
Performance: Exact strings use hash table lookups (O(1) constant time), making them the fastest entry type.
Auto-Detection
Matchy automatically determines the entry type:
Input Detected As
───────────────────── ─────────────
192.0.2.1 IP Address
10.0.0.0/8 CIDR Range
*.example.com Pattern
example.com Exact String
test-* Pattern
test.com Exact String
You don’t need to specify the type - Matchy infers it from the format.
Explicit Type Control (Prefix Technique)
Sometimes auto-detection doesn’t match your intent. Use type prefixes to force a specific entry type:
Available Prefixes
| Prefix | Type | Description |
|---|---|---|
| literal: | Exact String | Force exact match (no wildcards) |
| glob: | Pattern | Force glob pattern matching |
| ip: | IP/CIDR | Force IP address parsing |
Why Use Prefixes?
Problem 1: Literal strings that look like patterns
Some strings contain characters like *, ?, or [ that should be matched literally,
not as wildcards:
Without prefix:
file*.txt → Detected as pattern (matches file123.txt, fileabc.txt)
With prefix:
literal:file*.txt → Exact match only (matches "file*.txt" literally)
Problem 2: Patterns without wildcards
You might want to match a string as a pattern for consistency, even without wildcards:
Without prefix:
example.com → Detected as exact string
With prefix:
glob:example.com → Treated as pattern (useful for batch processing)
Problem 3: Ambiguous IP-like strings
Force IP parsing when needed:
With prefix:
ip:192.168.1.1 → Explicitly parsed as IP
Usage Examples
Text file input:
# Auto-detected
192.0.2.1
*.evil.com
malware.com
# Explicit control
literal:*.not-a-glob.com
glob:no-wildcards.com
ip:10.0.0.1
CSV input:
entry,category
literal:test[1].txt,filesystem
glob:*.example.com,pattern
ip:192.168.1.0/24,network
JSON input:
[
{"key": "literal:file[backup].tar", "data": {"type": "archive"}},
{"key": "glob:*.example.*", "data": {"category": "domain"}},
{"key": "ip:10.0.0.0/8", "data": {"range": "private"}}
]
Rust API:
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, MatchMode};
use std::collections::HashMap;
let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
// Auto-detection handles most cases
builder.add_entry("*.example.com", HashMap::new())?;
// Use prefixes when needed
builder.add_entry("literal:file*.txt", HashMap::new())?;
builder.add_entry("glob:simple-string", HashMap::new())?;
}
Prefix Stripping
The prefix is automatically stripped before processing:
Input: literal:*.example.com
Stored as: *.example.com (as exact string)
Matches: Only the exact string "*.example.com"
Input: glob:test.com
Stored as: test.com (as pattern)
Matches: Strings matching pattern "test.com"
Validation
Prefixes enforce validation:
# This will fail - invalid glob syntax
glob:[unclosed-bracket
# This will fail - invalid IP address
ip:not-an-ip-address
# literal: accepts anything (no validation)
literal:[any$pecial*chars]
When to Use
Use prefixes when:
- ✅ String contains *, ?, or [ that should be matched literally
- ✅ Processing mixed data where type is known externally
- ✅ Building programmatically from heterogeneous sources
- ✅ Debugging auto-detection issues
Don’t use prefixes when:
- ❌ Auto-detection works correctly (most cases)
- ❌ All entries are the same type (use format-specific method instead)
- ❌ Creating database manually (use add_ip(), add_literal(), add_glob() methods)
API Alternatives
Instead of using prefixes with add_entry(), you can call type-specific methods:
Rust API:
#![allow(unused)]
fn main() {
// Using prefix
builder.add_entry("literal:*.txt", data)?;
// Using explicit method (preferred in Rust)
builder.add_literal("*.txt", data)?;
}
Available methods:
- builder.add_ip(key, data) - Force IP/CIDR
- builder.add_literal(key, data) - Force exact string
- builder.add_glob(key, data) - Force pattern
- builder.add_entry(key, data) - Auto-detect (with prefix support)
See DatabaseBuilder API for details.
Match Precedence
When querying, Matchy checks in this order:
- IP address - If the query is a valid IP, search IP tree
- Exact string - Check hash table for exact match
- Patterns - Search for matching patterns
This means:
- IP queries are fastest (binary tree lookup)
- Exact strings are next fastest (hash table lookup)
- Pattern queries search all patterns (Aho-Corasick)
Multiple Matches
A query can match multiple entries:
Example:
Entries:
- *.com
- *.example.com
- evil.example.com
Query: evil.example.com
Matches: All three patterns!
Matchy returns all matching entries for pattern queries. This lets you apply multiple rules or categories to a single query.
Combining Entry Types
A single database can contain all entry types:
Database contents:
- 192.0.2.1 (IP)
- 10.0.0.0/8 (CIDR)
- *.evil.com (pattern)
- malware.com (exact string)
Query 192.0.2.1 → IP match
Query 10.5.5.5 → CIDR match
Query phishing.evil.com → Pattern match
Query malware.com → Exact match
This makes Matchy databases very versatile.
Entry Limits
Practical limits (depends on available memory):
- IP addresses: Millions
- CIDR ranges: Millions
- Patterns: Tens of thousands (automaton size grows)
- Exact strings: Millions
Performance degrades gracefully as databases grow. Most applications use thousands to tens of thousands of entries.
Examples by Tool
Adding and querying entries look slightly different in each tool; see First Database with CLI, First Database with Rust, and First Database with C for worked examples of both.
Next Steps
- Pattern Matching - Glob syntax and advanced patterns
- Data Types and Values - Storing data with entries
- Performance Considerations - Optimizing for your use case
Pattern Matching
Matchy uses glob patterns for flexible string matching. This chapter explains pattern syntax and matching rules.
Glob Syntax
Asterisk (*)
Matches zero or more of any character.
Pattern: *.example.com matches foo.example.com, bar.example.com
Question Mark (?)
Matches exactly one character.
Pattern: test-? matches test-1, test-a but not test-ab
Character Sets ([abc])
Matches one character from the set.
Pattern: test-[abc].com matches test-a.com, test-b.com, test-c.com
Negated Sets ([!abc])
Matches one character NOT in the set.
Pattern: test-[!abc] matches test-d, test-x but not test-a
Ranges ([a-z], [0-9])
Matches one character in the range.
Pattern: host-[0-9] matches host-1, host-9 but not host-x
Case Sensitivity
Matching behavior depends on the match mode set when building the database.
CaseInsensitive (recommended): *.Example.COM matches foo.example.com
CaseSensitive: Must match exact case
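As a brief sketch of the difference, using the builder and lookup APIs from earlier chapters (the assertion reflects the CaseInsensitive behavior described above):
use matchy::{Database, DatabaseBuilder, MatchMode};
use std::collections::HashMap;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
    builder.add_entry("*.Example.COM", HashMap::new())?;
    std::fs::write("patterns.mxy", builder.build()?)?;
    let db = Database::open("patterns.mxy")?;
    // CaseInsensitive: a mixed-case pattern still matches a lowercase query
    assert!(db.lookup("foo.example.com")?.is_some());
    // With MatchMode::CaseSensitive, only the exact case would match
    Ok(())
}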
Common Patterns
Domain suffixes: *.example.com, *.*.example.com
URL patterns: http://*/admin/*
Flexible matching: malware-*, *-[0-9][0-9][0-9]
Performance
Patterns use the Aho-Corasick algorithm, which searches all patterns simultaneously. A typical query takes 1-2 microseconds even with 50,000 patterns.
See Entry Types and Performance Considerations for more details.
Data Types and Values
Matchy stores structured data values with each entry. This chapter explains the supported data types.
Supported Types
String
Text values of any length.
Numbers
- Unsigned integers (uint16, uint32, uint64, uint128)
- Signed integers (int32)
- Floating point (float, double)
Boolean
True or false values.
Arrays
Ordered lists of values (can contain mixed types).
Maps
Key-value pairs (like JSON objects or hash maps).
Null
Explicit null/missing value.
Tool-Specific Representations
How you specify data types depends on your tool:
CLI: Use JSON notation in CSV/JSON files
key,data
192.0.2.1,"{""threat"": ""high"", ""score"": 95}"
Rust API: Use the DataValue enum
#![allow(unused)]
fn main() {
use matchy::DataValue;
data.insert("score".to_string(), DataValue::Uint32(95));
}
C API: Use JSON strings
matchy_builder_add(builder, "192.0.2.1", "{\"score\": 95}");
See tool-specific docs for complete details:
Nested Data
Maps and arrays can be nested to arbitrary depth:
{
"threat": {
"level": "high",
"categories": ["malware", "c2"],
"metadata": {
"first_seen": "2024-01-15",
"confidence": 0.95
}
}
}
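In Rust, the same nesting can be expressed with nested DataValue values. This is a sketch only: it assumes a DataValue::Map variant wrapping a HashMap<String, DataValue>, mirroring the Maps type listed above - check the Data Types reference for the exact constructor:
use matchy::DataValue;
use std::collections::HashMap;
fn main() {
    // Assumption: DataValue::Map(HashMap<String, DataValue>) represents a JSON-style object
    let mut metadata = HashMap::new();
    metadata.insert("first_seen".to_string(), DataValue::String("2024-01-15".to_string()));
    metadata.insert("confidence".to_string(), DataValue::Double(0.95));
    let mut threat = HashMap::new();
    threat.insert("level".to_string(), DataValue::String("high".to_string()));
    threat.insert("categories".to_string(), DataValue::Array(vec![
        DataValue::String("malware".to_string()),
        DataValue::String("c2".to_string()),
    ]));
    threat.insert("metadata".to_string(), DataValue::Map(metadata));
    let mut data = HashMap::new();
    data.insert("threat".to_string(), DataValue::Map(threat));
    let _ = data;
}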
Size Limits
Data is stored in compact binary format. Practical limits:
- Strings: Megabytes per string
- Arrays: Thousands of elements
- Maps: Thousands of keys
- Nesting: Dozens of levels deep
Most use cases store kilobytes per entry.
Next Steps
- Database Concepts - How data is stored
- Performance Considerations - Data size impact
Query Result Caching
Matchy includes a built-in LRU (Least Recently Used) cache for query results, providing 2-10x performance improvements for workloads with repeated queries.
Overview
The cache stores query results in memory, eliminating the need to re-execute database lookups for previously seen queries. This is particularly valuable for:
- Web APIs serving repeated requests
- Firewalls checking the same IPs frequently
- Real-time threat detection with hot patterns
- High-traffic services with predictable query patterns
Performance
Cache performance depends on the hit rate (percentage of queries found in cache):
| Hit Rate | Speedup vs Uncached | Use Case |
|---|---|---|
| 0% | 1.0x (no benefit) | Batch processing, unique queries |
| 50% | 1.5-2x | Mixed workload |
| 80% | 3-5x | Web API, typical firewall |
| 95% | 5-8x | High-traffic service |
| 99% | 8-10x | Repeated pattern checking |
Zero overhead when disabled: The cache uses compile-time optimization, so disabling it has no performance cost.
Configuration
Enabling the Cache
Use the builder API to configure cache capacity:
#![allow(unused)]
fn main() {
use matchy::Database;
// Enable cache with 10,000 entry capacity
let db = Database::from("threats.mxy")
.cache_capacity(10_000)
.open()?;
// Use the database normally - caching is transparent
if let Some(result) = db.lookup("evil.com")? {
println!("Match: {:?}", result);
}
}
Disabling the Cache
Explicitly disable caching for memory-constrained environments:
#![allow(unused)]
fn main() {
let db = Database::from("threats.mxy")
.no_cache() // Disable caching
.open()?;
}
Default behavior: If you don’t specify cache configuration, a reasonable default cache is enabled.
Cache Management
Inspecting Cache Size
Check how many entries are currently cached:
#![allow(unused)]
fn main() {
println!("Cache entries: {}", db.cache_size());
}
Clearing the Cache
Clear all cached entries:
#![allow(unused)]
fn main() {
db.clear_cache();
println!("Cache cleared: {}", db.cache_size()); // 0
}
This is useful for:
- Memory management in long-running processes
- Testing with fresh cache state
- Resetting after configuration changes
How It Works
The cache is an LRU (Least Recently Used) cache:
- On first query: Result is computed and stored in cache
- On repeated query: Result is returned from cache (fast!)
- When cache is full: Least recently used entry is evicted
The cache is thread-safe using interior mutability, so multiple queries can safely share the same Database instance.
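A small sketch tying these pieces together, using the cache configuration and inspection methods shown earlier in this chapter:
use matchy::Database;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let db = Database::from("threats.mxy")
        .cache_capacity(10_000)
        .open()?;
    db.lookup("evil.com")?;   // first query: result computed and stored in the cache
    db.lookup("evil.com")?;   // repeated query: served from the cache
    println!("Cached entries: {}", db.cache_size());
    db.clear_cache();         // evict everything, e.g. before a fresh benchmark run
    Ok(())
}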
Cache Capacity Guidelines
Choose cache capacity based on your workload:
| Workload | Recommended Capacity | Reasoning |
|---|---|---|
| Web API (< 1000 req/s) | 1,000 - 10,000 | Covers hot patterns |
| Firewall (medium traffic) | 10,000 - 50,000 | Covers recent IPs |
| High-traffic service | 50,000 - 100,000 | Maximize hit rate |
| Memory-constrained | Disable cache | Save memory |
Memory usage: Each cache entry uses ~100-200 bytes, so:
- 10,000 entries ≈ 1-2 MB
- 100,000 entries ≈ 10-20 MB
When to Use Caching
✅ Use Caching For:
- Web APIs with repeated queries
- Firewalls checking the same IPs
- Real-time monitoring with hot patterns
- Long-running services with predictable queries
❌ Skip Caching For:
- Batch processing (all queries unique)
- One-time scans (no repeated queries)
- Memory-constrained environments
- Testing where you need fresh results
Example: Web API with Caching
#![allow(unused)]
fn main() {
use matchy::Database;
use std::sync::Arc;
// Create a shared database with caching
let db = Arc::new(
Database::from("threats.mxy")
.cache_capacity(50_000) // High capacity for web API
.open()?
);
// Share across request handlers
let db_clone = Arc::clone(&db);
tokio::spawn(async move {
// Handle requests
loop {
let query = receive_request().await;
// Cache hit on repeated queries!
if let Ok(Some(result)) = db_clone.lookup(&query) {
send_response(result).await;
}
}
});
}
Benchmarking Cache Performance
Use the provided benchmark to measure cache performance on your workload:
# Run the cache demo
cargo run --release --example cache_demo
# Or run the comprehensive benchmark
cargo bench --bench cache_bench
See examples/cache_demo.rs for a complete working example.
Comparison with No Cache
Here’s a typical performance comparison:
#![allow(unused)]
fn main() {
// Without cache (baseline)
let db_uncached = Database::from("db.mxy").no_cache().open()?;
// 10,000 queries: 2.5s → 4,000 QPS
// With cache (80% hit rate)
let db_cached = Database::from("db.mxy").cache_capacity(10_000).open()?;
// 10,000 queries: 0.8s → 12,500 QPS (3x faster!)
}
Summary
- Simple configuration: Just add .cache_capacity(size) to the builder
- Transparent operation: No code changes after configuration
- Significant speedup: 2-10x for high hit rates
- Zero overhead: No cost when disabled
- Thread-safe: Safe to share across threads
Query result caching is one of the easiest ways to improve Matchy performance for real-world workloads.
Auto-Reload and Callbacks
Matchy supports automatic database reloading when files change, enabling zero-downtime updates in production systems. The auto-reload feature uses lock-free Arc swapping for minimal performance overhead.
Quick Start
Rust API
#![allow(unused)]
fn main() {
use matchy::Database;
// Enable auto-reload
let db = Database::from("threats.mxy")
.watch()
.open()?;
// Queries automatically use the latest database version
let result = db.lookup("192.168.1.1")?;
}
C API
#include <matchy/matchy.h>
// Configure auto-reload
matchy_open_options_t opts;
matchy_init_open_options(&opts);
opts.auto_reload = true;
matchy_t *db = matchy_open_with_options("threats.mxy", &opts);
// Queries automatically use latest version
matchy_result_t result;
matchy_lookup(db, "192.168.1.1", &result);
matchy_close(db);
How Auto-Reload Works
When auto-reload is enabled:
- File watching - A background thread monitors the database file using OS notifications
- Debouncing - File changes are debounced (200ms) to avoid rapid reload cycles
- Background loading - New database is loaded in a background thread
- Atomic swap - New database is atomically swapped using lock-free Arc pointer
- Graceful handoff - Old database stays alive until all query threads finish with it
┌─────────────┐
│ Query Thread│
│ Thread 1 │──┐
└─────────────┘ │
│ ┌──────────────┐ ┌──────────────┐
┌─────────────┐ ├───→│ ArcSwap │────→│ Database v1 │
│ Query Thread│ │ │ (atomic ptr) │ └──────────────┘
│ Thread 2 │──┤ └──────────────┘ │
└─────────────┘ │ ▲ │
│ │ │ (stays alive
┌─────────────┐ │ ┌──────────────┐ │ until all
│ Query Thread│ │ │ Watcher │ │ refs drop)
│ Thread N │──┘ │ Thread │ │
└─────────────┘ └──────────────┘ ▼
│ ┌──────────────┐
│ (atomic │ Database v2 │
└─ swap) │ (new) │
└──────────────┘
Performance
Auto-reload has minimal overhead:
- Per-query cost: ~1-2 nanoseconds (atomic generation counter check)
- After first check: Zero overhead (thread-local Arc caching)
- No locks: Pure lock-free atomic operations
- Scalability: No contention even with 160+ cores
Performance Breakdown
#![allow(unused)]
fn main() {
// First query after reload (~1-2ns overhead)
let result = db.lookup("192.168.1.1")?; // Check generation + cache Arc
// Subsequent queries (zero overhead)
let result = db.lookup("192.168.1.2")?; // Pure thread-local access
let result = db.lookup("192.168.1.3")?; // Pure thread-local access
}
The generation check is a single atomic load operation, comparable to checking a boolean flag.
Reload Callbacks
Get notified when database reloads occur:
Rust API
#![allow(unused)]
fn main() {
use matchy::{Database, ReloadEvent};
let db = Database::from("threats.mxy")
.watch()
.on_reload(|event: ReloadEvent| {
if event.success {
println!("✅ Database reloaded successfully");
println!(" Path: {}", event.path.display());
println!(" Generation: {}", event.generation);
} else {
eprintln!("❌ Database reload failed");
eprintln!(" Path: {}", event.path.display());
eprintln!(" Error: {}", event.error.unwrap());
}
})
.open()?;
}
The ReloadEvent structure contains:
#![allow(unused)]
fn main() {
pub struct ReloadEvent {
pub path: PathBuf, // Database file path
pub success: bool, // Whether reload succeeded
pub error: Option<String>, // Error message (if failed)
pub generation: u64, // Generation counter
pub source: ReloadSource, // What triggered the reload
}
}
C API
#include <matchy/matchy.h>
#include <stdio.h>
// Callback function
void on_reload(const matchy_reload_event_t *event, void *user_data) {
if (event->success) {
printf("✅ Reloaded: %s (generation %lu)\n",
event->path, event->generation);
} else {
fprintf(stderr, "❌ Reload failed: %s - %s\n",
event->path, event->error);
}
}
int main() {
// Configure callback
matchy_open_options_t opts;
matchy_init_open_options(&opts);
opts.auto_reload = true;
opts.reload_callback = on_reload;
opts.reload_callback_user_data = NULL; // Optional context pointer
matchy_t *db = matchy_open_with_options("threats.mxy", &opts);
// ... use database ...
matchy_close(db);
return 0;
}
Callback Safety
Important considerations:
- Callbacks run on the watcher thread, not query threads
- Keep callbacks fast and non-blocking
- Do not call matchy query functions from callbacks (potential deadlock)
- Copy event.path and event.error if you need them after the callback returns
- Callbacks must be thread-safe
Use Cases
Production Threat Intelligence
#![allow(unused)]
fn main() {
// Threat database updated hourly from feed
let db = Database::from("/data/threats.mxy")
.watch()
.on_reload(|event| {
if event.success {
// Log to monitoring system
metrics::increment_counter!("db_reload_success");
info!("Threat database updated: generation {}", event.generation);
} else {
// Alert on failure
metrics::increment_counter!("db_reload_failure");
error!("Failed to reload threats: {:?}", event.error);
}
})
.open()?;
// Queries automatically use latest threat data
for log_entry in log_stream {
if let Some(threat) = db.lookup(&log_entry.ip)? {
alert_security_team(log_entry, threat);
}
}
}
GeoIP Database Updates
#![allow(unused)]
fn main() {
// GeoIP database refreshed weekly
let geoip = Database::from("/data/GeoLite2-City.mmdb")
.watch()
.on_reload(|event| {
println!("GeoIP database updated: {}", event.path.display());
})
.open()?;
// No service restart needed for updates
let location = geoip.lookup("8.8.8.8")?;
}
Multi-Process Deployment
#![allow(unused)]
fn main() {
// Worker process
let db = Arc::new(
Database::from("threats.mxy")
.watch()
.open()?
);
// Spawn multiple worker threads
for i in 0..num_cpus::get() {
let db_clone = Arc::clone(&db);
thread::spawn(move || {
// Each thread automatically gets reloaded database
loop {
let work = get_work();
// `?` can't be used in this closure, so handle the Result explicitly
if let Ok(Some(result)) = db_clone.lookup(&work.query) {
    process_result(result);
}
}
});
}
}
HTTP Auto-Update
Matchy supports automatic updates for databases that include an embedded update URL. The database uses this internal metadata to periodically check for updates and download them if changed.
Rust API
#![allow(unused)]
fn main() {
// Database must have embedded update URL (from DatabaseBuilder::with_update_url())
let db = Database::from("threats.mxy")
.auto_update() // No URL parameter - uses embedded metadata
.update_interval(Duration::from_secs(3600))
.cache_dir("/var/cache/myapp") // Optional: defaults to ~/.cache/matchy/
.on_reload(|event| {
match event.source {
ReloadSource::FileChange => println!("Local file changed"),
ReloadSource::NetworkUpdate => println!("Downloaded new version"),
}
})
.open()?; // Returns error if database has no embedded URL
}
The auto-update feature:
- Self-describing: Uses URL embedded in the database file (set during build)
- Safe updates: Downloads to a cache directory (~/.cache/matchy/ by default), never overwriting the original file
- Composable: Can be combined with watch() to handle both local replacements and network updates
- Efficient: Uses ETag and Last-Modified headers to avoid unnecessary downloads
- Robust: Validates the database before swapping
C API
matchy_open_options_t opts;
matchy_init_open_options(&opts);
// Enable auto-update (requires embedded URL in database)
opts.auto_update = true;
// Optional: set custom download location (formerly update_url)
opts.cache_dir = "/var/cache/myapp";
matchy_t *db = matchy_open_with_options("threats.mxy", &opts);
Database Update Best Practices
Atomic File Replacement
Always use atomic rename for updates:
# Build new database
matchy build new-threats.csv -o threats.mxy.tmp
# Atomic rename (works on all platforms)
mv threats.mxy.tmp threats.mxy
This ensures:
- No partial database reads
- Auto-reload detects the change
- Zero query errors during update
Update Scripts
#!/bin/bash
# update-threats.sh - Safe database update script
set -e
DB_PATH="/data/threats.mxy"
TEMP_DB="${DB_PATH}.tmp"
# Download and build new database
curl -o threats.csv "https://threat-feed.example.com/latest"
matchy build threats.csv -o "$TEMP_DB"
# Validate before deploying
matchy validate "$TEMP_DB" --level strict
# Atomic replace
mv "$TEMP_DB" "$DB_PATH"
echo "✅ Database updated successfully"
Monitoring Reloads
#![allow(unused)]
fn main() {
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
let reload_count = Arc::new(AtomicU64::new(0));
let reload_count_clone = Arc::clone(&reload_count);
let db = Database::from("threats.mxy")
.watch()
.on_reload(move |event| {
if event.success {
reload_count_clone.fetch_add(1, Ordering::Relaxed);
}
})
.open()?;
// Later: check reload metrics
let reloads = reload_count.load(Ordering::Relaxed);
println!("Database has been reloaded {} times", reloads);
}
Limitations
File System Events
- Linux: Uses inotify (requires kernel support)
- macOS: Uses FSEvents (works with atomic renames)
- Windows: Uses ReadDirectoryChangesW
- Network filesystems: May have delayed notifications (NFS, CIFS, etc.)
Debouncing
File changes are debounced for 200ms to avoid rapid reload cycles. If your build process writes the file in multiple stages, only the final change triggers reload.
Memory Usage
During reload, both old and new databases are in memory briefly:
Normal: 1x database size
Reload: 2x database size (temporary)
Old database is freed once all query threads release their references (typically <1 second).
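This handoff is ordinary Arc reference counting: in-flight queries keep the old mapping alive, and it is released when the last reference drops. A tiny std-only illustration (not Matchy code):
#![allow(unused)]
fn main() {
use std::sync::Arc;

let old_db = Arc::new(vec![1u8, 2, 3]);  // stand-in for the old mapping
let in_flight = Arc::clone(&old_db);     // a query thread still holding a reference

let new_db = Arc::new(vec![4u8, 5, 6]);  // the reload produces a fresh Arc
// After the swap, new queries clone `new_db`; nobody new sees `old_db`.

assert_eq!(Arc::strong_count(&old_db), 2); // old data stays alive for the in-flight query
drop(in_flight);                           // the last query finishes...
assert_eq!(Arc::strong_count(&old_db), 1); // ...and only our local handle remains
drop(old_db);                              // now the old allocation is released
}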
Troubleshooting
Reload Not Triggering
Check file watcher:
# Enable debug logging
RUST_LOG=matchy=debug cargo run
Verify file changes:
# Check file modification time
stat threats.mxy
# Force update
touch threats.mxy
Callbacks Not Firing
Ensure callback is set before database changes:
#![allow(unused)]
fn main() {
// ❌ Wrong: no callback registered before open() - reloads happen silently
let db = Database::from("threats.mxy").watch().open()?;
// File changes are still picked up, but nothing notifies your code
// ✅ Correct: callback set during open
let db = Database::from("threats.mxy")
.watch()
.on_reload(|e| println!("Reloaded!"))
.open()?;
}
Performance Impact
If auto-reload overhead is too high:
#![allow(unused)]
fn main() {
use std::time::Instant;
// Measure overhead
let start = Instant::now();
for i in 0..1_000_000 {
db.lookup("192.168.1.1")?;
}
println!("Time: {:?}", start.elapsed());
}
Expected: <1-2ns per query overhead. If higher, check for:
- Very high query rate (>100M QPS per thread)
- NUMA architecture with poor cache affinity
- Excessive reloads (reduce update frequency)
Next Steps
- Performance Considerations - Detailed performance analysis
- Query Result Caching - Combine with caching for maximum throughput
- Examples - Complete working examples
Pattern Extraction
Matchy includes a high-performance pattern extractor for finding domains, IP addresses (IPv4 and IPv6), email addresses, and file hashes (MD5, SHA1, SHA256, SHA384) in unstructured text like log files.
Overview
The Extractor uses SIMD-accelerated algorithms to scan text and extract patterns at 200-500 MB/sec. This is useful for:
- Log scanning: Find domains/IPs in access logs, firewall logs, etc.
- Threat detection: Extract indicators from security logs
- Analytics: Count unique domains/IPs in large datasets
- Compliance: Find email addresses or PII in audit logs
- Forensics: Extract patterns from binary logs
Quick Start
#![allow(unused)]
fn main() {
use matchy::extractor::Extractor;
let extractor = Extractor::new()?;
let log_line = b"2024-01-15 GET /api evil.example.com 192.168.1.1";
for match_item in extractor.extract_from_line(log_line) {
println!("Found: {}", match_item.as_str(log_line));
}
// Output:
// Found: evil.example.com
// Found: 192.168.1.1
}
Supported Patterns
Domains
Extracts fully qualified domain names with TLD validation:
#![allow(unused)]
fn main() {
let line = b"Visit api.example.com or https://www.github.com/path";
for match_item in extractor.extract_from_line(line) {
if let ExtractedItem::Domain(domain) = match_item.item {
println!("Domain: {}", domain);
}
}
// Output:
// Domain: api.example.com
// Domain: www.github.com
}
Features:
- TLD validation: 10K+ real TLDs from Public Suffix List
- Unicode support: Handles münchen.de, café.fr (both UTF-8 and punycode)
- Subdomain extraction: Extracts full domain from URLs
- Word boundaries: Avoids false positives in non-domain text
IPv4 Addresses
Extracts all valid IPv4 addresses:
#![allow(unused)]
fn main() {
let line = b"Traffic from 10.0.0.5 to 172.16.0.10";
for match_item in extractor.extract_from_line(line) {
if let ExtractedItem::Ipv4(ip) = match_item.item {
println!("IP: {}", ip);
}
}
// Output:
// IP: 10.0.0.5
// IP: 172.16.0.10
}
Features:
- SIMD-accelerated: Uses memchr for fast dot detection
- Validation: Rejects invalid IPs (256.1.1.1, 999.0.0.1)
- Word boundaries: Avoids false matches in version numbers
IPv6 Addresses
Extracts all valid IPv6 addresses:
#![allow(unused)]
fn main() {
let line = b"Server at 2001:db8::1 responded from fe80::1";
for match_item in extractor.extract_from_line(line) {
if let ExtractedItem::Ipv6(ip) = match_item.item {
println!("IPv6: {}", ip);
}
}
// Output:
// IPv6: 2001:db8::1
// IPv6: fe80::1
}
Features:
- SIMD-accelerated: Uses memchr for fast colon detection
- Compressed notation: Handles :: and full addresses
- Validation: Full RFC 4291 compliance via Rust's Ipv6Addr
- Mixed notation: Supports ::ffff:127.0.0.1 format
Email Addresses
Extracts RFC 5322-compliant email addresses:
#![allow(unused)]
fn main() {
let line = b"Contact alice@example.com or bob+tag@company.org";
for match_item in extractor.extract_from_line(line) {
if let ExtractedItem::Email(email) = match_item.item {
println!("Email: {}", email);
}
}
// Output:
// Email: alice@example.com
// Email: bob+tag@company.org
}
Features:
- Plus addressing: Supports user+tag@example.com
- Subdomain validation: Checks domain part for valid TLD
File Hashes
Extracts MD5, SHA1, SHA256, and SHA384 file hashes:
#![allow(unused)]
fn main() {
use matchy::extractor::{ExtractedItem, HashType};
let line = b"malware.exe MD5=5d41402abc4b2a76b9719d911017c592 detected";
for match_item in extractor.extract_from_line(line) {
if let ExtractedItem::Hash(hash_type, hash) = match_item.item {
let type_str = match hash_type {
HashType::Md5 => "MD5",
HashType::Sha1 => "SHA1",
HashType::Sha256 => "SHA256",
HashType::Sha384 => "SHA384",
};
println!("{}: {}", type_str, hash);
}
}
// Output:
// MD5: 5d41402abc4b2a76b9719d911017c592
}
Features:
- Boundary detection: Finds tokens of exact length (32/40/64/96 hex chars)
- SIMD hex validation: Auto-vectorized lookup table for fast hex checking
- Case insensitive: Accepts both lowercase and uppercase hex
- Zero false positives: Rejects UUIDs (with dashes) and non-hex strings
- High throughput: ~1-2 GB/sec processing speed
Supported hash types:
- MD5: 32 hex characters (e.g., 5d41402abc4b2a76b9719d911017c592)
- SHA1: 40 hex characters (e.g., 2fd4e1c67a2d28fced849ee1bb76e7391b93eb12)
- SHA256: 64 hex characters (e.g., 2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae)
- SHA384: 96 hex characters (e.g., cb00753f45a35e8bb5a03d699ac65007272c32ab0eded1631a8b605a43ff5bed8086072ba1e7cc2358baeca134c825a7)
Configuration
Customize extraction behavior using the builder pattern:
#![allow(unused)]
fn main() {
use matchy::extractor::Extractor;
let extractor = Extractor::builder()
.extract_domains(true) // Enable domain extraction
.extract_ipv4(true) // Enable IPv4 extraction
.extract_ipv6(true) // Enable IPv6 extraction
.extract_emails(false) // Disable email extraction
.min_domain_labels(3) // Require 3+ labels (api.test.com)
.require_word_boundaries(true) // Enforce word boundaries
.build()?;
}
Configuration Options
| Option | Default | Description |
|---|---|---|
extract_domains | true | Extract domain names |
extract_ipv4 | true | Extract IPv4 addresses |
extract_ipv6 | true | Extract IPv6 addresses |
extract_emails | true | Extract email addresses |
extract_hashes | true | Extract file hashes (MD5, SHA1, SHA256, SHA384) |
min_domain_labels | 2 | Minimum labels (2 = example.com, 3 = api.example.com) |
require_word_boundaries | true | Ensure patterns have word boundaries |
Unicode and IDN Support
The extractor handles Unicode domains automatically:
#![allow(unused)]
fn main() {
let line = "Visit münchen.de or café.fr".as_bytes();
for match_item in extractor.extract_from_line(line) {
if let ExtractedItem::Domain(domain) = match_item.item {
println!("Unicode domain: {}", domain);
}
}
// Output:
// Unicode domain: münchen.de
// Unicode domain: café.fr
}
How it works:
- Extracts Unicode text as-is
- Validates TLD using punycode conversion internally
- Returns original Unicode form (not punycode)
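For reference, the punycode step can be reproduced with the third-party idna crate (e.g., idna 0.5). This sketch assumes idna as a dependency and is not Matchy's internal code:
fn main() {
    // Illustrative only: uses the `idna` crate, not Matchy's internals.
    let unicode = "münchen.de";
    let ascii = idna::domain_to_ascii(unicode).expect("valid internationalized domain");
    println!("{unicode} -> {ascii}"); // münchen.de -> xn--mnchen-3ya.de
    // The ASCII form is what gets checked against the TLD list,
    // while the original Unicode form is what the extractor returns.
}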
Binary Log Support
The extractor can find ASCII patterns in binary data:
#![allow(unused)]
fn main() {
let mut binary_log = Vec::new();
binary_log.extend_from_slice(b"Log: ");
binary_log.push(0xFF); // Invalid UTF-8
binary_log.extend_from_slice(b" evil.com ");
for match_item in extractor.extract_from_line(&binary_log) {
println!("Found in binary: {}", match_item.as_str(&binary_log));
}
// Output:
// Found in binary: evil.com
}
This is useful for scanning:
- Binary protocol logs
- Corrupted text files
- Mixed encoding logs
Performance
The extractor is highly optimized:
- Throughput: 200-500 MB/sec on typical log files
- SIMD acceleration: Uses memchr for byte scanning
- Zero-copy: No string allocation until match
- Lazy UTF-8 validation: Only validates matched patterns
Performance Tips
- Disable unused extractors to reduce overhead:
#![allow(unused)]
fn main() {
let extractor = Extractor::builder()
    .extract_ipv4(true)      // Only extract IPv4
    .extract_ipv6(true)      // Only extract IPv6
    .extract_domains(false)
    .extract_emails(false)
    .build()?;
}
- Process line-by-line for better memory usage:
#![allow(unused)]
fn main() {
for line in BufReader::new(file).lines() {
    for match_item in extractor.extract_from_line(line?.as_bytes()) {
        // Process match
    }
}
}
- Use byte slices to avoid UTF-8 conversion:
#![allow(unused)]
fn main() {
// Fast: no UTF-8 validation on whole line
extractor.extract_from_line(line_bytes)
// Slower: validates entire line as UTF-8 first
extractor.extract_from_line(line_str.as_bytes())
}
Combining with Database Lookups
After extracting patterns, you typically want to look them up in a database. Use lookup_extracted() for a clean, efficient API:
#![allow(unused)]
fn main() {
use matchy::{Database, extractor::Extractor};
let db = Database::from("threats.mxy").open()?;
let extractor = Extractor::new()?;
let log_line = b"Traffic from 192.168.1.100 to evil.com";
for item in extractor.extract_from_line(log_line) {
if let Some(result) = db.lookup_extracted(&item, log_line)? {
println!("⚠️ Match: {} ({})",
item.as_str(log_line),
item.item.type_name()
);
}
}
}
See the Querying guide for complete details on the extract-and-lookup pattern.
CLI Integration
The matchy match command uses the extractor internally:
# Scan logs for threats (outputs JSON to stdout)
matchy match threats.mxy access.log
# Each match is a JSON line:
# {"timestamp":"123.456","line_number":1,"matched_text":"evil.com","match_type":"pattern",...}
# {"timestamp":"123.789","line_number":2,"matched_text":"1.2.3.4","match_type":"ip",...}
# Show statistics (to stderr)
matchy match threats.mxy access.log --stats
# Statistics output (stderr):
# [INFO] Lines processed: 15,234
# [INFO] Lines with matches: 127 (0.8%)
# [INFO] Throughput: 450.23 MB/s
See matchy match for CLI details.
Examples
Complete working examples:
- examples/extractor_demo.rs: Demonstrates all extraction features
- src/bin/matchy.rs: See cmd_match() for the CLI implementation
Run the demo:
cargo run --release --example extractor_demo
Summary
- High performance: 200-500 MB/sec throughput
- SIMD-accelerated: Fast pattern finding
- Unicode support: Handles international domains
- Binary logs: Extracts ASCII from non-UTF-8
- Zero-copy: Efficient memory usage
- Configurable: Customize extraction behavior
Pattern extraction makes it easy to scan large log files and find security indicators.
MMDB Compatibility
Matchy can read standard MaxMind MMDB files and extends the format to support string and pattern matching.
Reading MMDB Files
MaxMind’s GeoIP databases use the MMDB format. Matchy can read these files directly:
#![allow(unused)]
fn main() {
use matchy::Database;
// Open a MaxMind GeoLite2 database
let db = Database::open("GeoLite2-City.mmdb")?;
// Query an IP address
match db.lookup("8.8.8.8")? {
Some(result) => {
println!("Location data: {:?}", result);
}
None => println!("IP not found"),
}
}
The same works from the CLI:
$ matchy query GeoLite2-City.mmdb 8.8.8.8
Found: IP address 8.8.8.8/32
country: "US"
city: "Mountain View"
coordinates: [37.386, -122.0838]
MMDB Format Overview
MMDB files contain:
- IP tree - Binary trie mapping IP addresses to data
- Data section - Structured data storage (strings, numbers, maps, arrays)
- Metadata - Database information (build time, version, etc.)
This is a compact, binary format designed for fast IP address lookups.
Matchy Extensions
Matchy extends MMDB with additional sections:
Standard MMDB
┌──────────────────────────────┐
│ IP Tree │ IPv4 and IPv6 lookup
├──────────────────────────────┤
│ Data Section │ Structured data
├──────────────────────────────┤
│ Metadata │ Database info
└──────────────────────────────┘
Matchy Extended Format
┌─────────────────────────────────────────────────┐
│ IP Tree │ IPv4 and IPv6 (MMDB compatible)
├─────────────────────────────────────────────────┤
│ Data Section │ Structured data (MMDB compatible)
├─────────────────────────────────────────────────┤
│ Hash Table │ Exact string matches (Matchy extension)
├─────────────────────────────────────────────────┤
│ AC Automaton │ Pattern matching (Matchy extension)
├─────────────────────────────────────────────────┤
│ Metadata │ Database info
└─────────────────────────────────────────────────┘
The IP tree and data section remain fully compatible with standard MMDB readers.
Compatibility Guarantees
Reading MMDB files:
- ✅ Matchy can read any standard MMDB file
- ✅ IP lookups work exactly as expected
- ✅ GeoIP, ASN, and other MaxMind databases supported
Writing Matchy databases:
- ✅ Standard MMDB readers can read the IP portion
- ⚠️ String and pattern extensions are ignored by standard readers
- ✅ Matchy databases work with Matchy tools (CLI and APIs)
Practical Examples
Using GeoIP Databases
MaxMind provides free GeoLite2 databases. Download and use them directly:
$ wget https://example.com/GeoLite2-City.mmdb
$ matchy query GeoLite2-City.mmdb 1.1.1.1
From Rust:
#![allow(unused)]
fn main() {
let db = Database::open("GeoLite2-City.mmdb")?;
if let Some(result) = db.lookup("1.1.1.1")? {
// Access location data
println!("Result: {:?}", result);
}
}
Extending MMDB Files
You can build a database that combines IP data (MMDB compatible) with patterns (Matchy extension):
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
// Add IP data (MMDB compatible)
let mut ip_data = HashMap::new();
ip_data.insert("country".to_string(), DataValue::String("US".to_string()));
builder.add_entry("8.8.8.8", ip_data)?;
// Add pattern data (Matchy extension)
let mut pattern_data = HashMap::new();
pattern_data.insert("category".to_string(), DataValue::String("search".to_string()));
builder.add_entry("*.google.com", pattern_data)?;
let db_bytes = builder.build()?;
std::fs::write("extended.mxy", &db_bytes)?;
}
Standard MMDB readers will see the IP data. Matchy tools will see both IP and pattern data.
File Format Details
MMDB files are binary and consist of:
- IP Tree: Binary trie where each node represents a network bit
- Data Section: Compact binary encoding of values
- Metadata: JSON with database information
Matchy preserves this structure and adds:
- Hash Table: For O(1) exact string lookups
- Aho-Corasick Automaton: For simultaneous pattern matching
See Binary Format Specification for complete details.
Version Compatibility
Matchy supports:
- MMDB format version 2.x (current standard)
- IPv4 and IPv6 address families
- All MMDB data types (strings, integers, floats, maps, arrays)
When building databases, Matchy uses MMDB format 2.0 for the IP tree and data section.
Performance Comparison
MMDB lookups in Matchy have similar performance to MaxMind’s official libraries:
MaxMind libmaxminddb: ~5-10 million IP lookups/second
Matchy IP lookups: ~7 million IP lookups/second
Both use:
- Binary trie traversal (O(32) for IPv4, O(128) for IPv6, bounded by address bit length)
- Memory mapping for instant loading
- Zero-copy data access
The extensions (hash table and pattern matching) add minimal overhead to IP lookups.
Migration from libmaxminddb
If you’re using MaxMind’s C library (libmaxminddb), Matchy provides similar functionality:
libmaxminddb:
MMDB_s mmdb;
MMDB_open("GeoLite2-City.mmdb", 0, &mmdb);
int gai_error, mmdb_error;
MMDB_lookup_result_s result =
MMDB_lookup_string(&mmdb, "8.8.8.8", &gai_error, &mmdb_error);
Matchy:
matchy_t *db = matchy_open("GeoLite2-City.mmdb");
matchy_result_t result = matchy_query(db, "8.8.8.8");
Both load the database via memory mapping and provide similar query performance.
Next Steps
- Binary Format Specification - Detailed format docs
- Performance Considerations - Optimization strategies
- Entry Types - Understanding all entry types
Migrating from libmaxminddb
Matchy provides a compatibility layer that implements the libmaxminddb API on top of matchy’s engine. Most existing libmaxminddb applications can switch to matchy with minimal code changes.
Quick Start
Before (libmaxminddb)
#include <maxminddb.h>
// Compile: gcc -o app app.c -lmaxminddb
After (matchy)
#include <matchy/maxminddb.h>
// Compile: gcc -o app app.c -lmatchy
That’s it! Most applications will work with just these changes.
Why Migrate?
Benefits of switching to matchy:
- Unified database format: IP addresses + string patterns + exact strings in one file
- Better performance: Faster loads, optimized queries
- Memory-mapped by default: Instant startup times
- Active development: Modern codebase in Rust
- Drop-in compatibility: Minimal code changes required
Migration Steps
1. Update Include Path
Before:
#include <maxminddb.h>
After:
#include <matchy/maxminddb.h>
2. Update Linker Flags
Before:
gcc -o myapp myapp.c -lmaxminddb
After:
gcc -o myapp myapp.c -I/path/to/matchy/include -L/path/to/matchy/lib -lmatchy
Or with pkg-config:
gcc -o myapp myapp.c $(pkg-config --cflags --libs matchy)
3. Recompile
The compatibility layer is API compatible but NOT binary compatible. You must recompile your application.
make clean
make
4. Test
Your existing .mmdb files should work without modification:
./myapp /path/to/GeoLite2-City.mmdb
Complete Example
Original libmaxminddb Code
#include <maxminddb.h>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "Usage: %s <database> <ip>\n", argv[0]);
exit(1);
}
const char *database = argv[1];
const char *ip_address = argv[2];
MMDB_s mmdb;
int status = MMDB_open(database, MMDB_MODE_MMAP, &mmdb);
if (status != MMDB_SUCCESS) {
fprintf(stderr, "Can't open %s: %s\n",
database, MMDB_strerror(status));
exit(1);
}
int gai_error, mmdb_error;
MMDB_lookup_result_s result = MMDB_lookup_string(
&mmdb, ip_address, &gai_error, &mmdb_error);
if (gai_error != 0) {
fprintf(stderr, "Error from getaddrinfo: %s\n",
gai_strerror(gai_error));
exit(1);
}
if (mmdb_error != MMDB_SUCCESS) {
fprintf(stderr, "Lookup error: %s\n",
MMDB_strerror(mmdb_error));
exit(1);
}
if (result.found_entry) {
MMDB_entry_data_s entry_data;
// Get country ISO code
status = MMDB_get_value(&result.entry, &entry_data,
"country", "iso_code", NULL);
if (status == MMDB_SUCCESS && entry_data.has_data &&
entry_data.type == MMDB_DATA_TYPE_UTF8_STRING) {
printf("%.*s\n", entry_data.data_size, entry_data.utf8_string);
}
} else {
printf("No entry found for %s\n", ip_address);
}
MMDB_close(&mmdb);
return 0;
}
Migrated to Matchy
#include <matchy/maxminddb.h> // Only change: include path
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "Usage: %s <database> <ip>\n", argv[0]);
exit(1);
}
const char *database = argv[1];
const char *ip_address = argv[2];
MMDB_s mmdb;
int status = MMDB_open(database, MMDB_MODE_MMAP, &mmdb);
if (status != MMDB_SUCCESS) {
fprintf(stderr, "Can't open %s: %s\n",
database, MMDB_strerror(status));
exit(1);
}
int gai_error, mmdb_error;
MMDB_lookup_result_s result = MMDB_lookup_string(
&mmdb, ip_address, &gai_error, &mmdb_error);
if (gai_error != 0) {
fprintf(stderr, "Error from getaddrinfo: %s\n",
gai_strerror(gai_error));
exit(1);
}
if (mmdb_error != MMDB_SUCCESS) {
fprintf(stderr, "Lookup error: %s\n",
MMDB_strerror(mmdb_error));
exit(1);
}
if (result.found_entry) {
MMDB_entry_data_s entry_data;
// Get country ISO code
status = MMDB_get_value(&result.entry, &entry_data,
"country", "iso_code", NULL);
if (status == MMDB_SUCCESS && entry_data.has_data &&
entry_data.type == MMDB_DATA_TYPE_UTF8_STRING) {
printf("%.*s\n", entry_data.data_size, entry_data.utf8_string);
}
} else {
printf("No entry found for %s\n", ip_address);
}
MMDB_close(&mmdb);
return 0;
}
Differences: Only the #include line changed!
Compatibility Matrix
Fully Supported Functions
These functions work identically to libmaxminddb:
| Function | Status | Notes |
|---|---|---|
MMDB_open() | ✅ Full | Opens .mmdb files |
MMDB_close() | ✅ Full | Closes database |
MMDB_lookup_string() | ✅ Full | IP string lookup |
MMDB_lookup_sockaddr() | ✅ Full | sockaddr lookup |
MMDB_get_value() | ✅ Full | Navigate data structures |
MMDB_vget_value() | ✅ Full | va_list variant |
MMDB_aget_value() | ✅ Full | Array variant |
MMDB_get_entry_data_list() | ✅ Full | Full data traversal |
MMDB_free_entry_data_list() | ✅ Full | Free list |
MMDB_lib_version() | ✅ Full | Returns matchy version |
MMDB_strerror() | ✅ Full | Error messages |
Stub Functions (Not Implemented)
These rarely-used functions return errors:
| Function | Status | Notes |
|---|---|---|
MMDB_read_node() | ⚠️ Stub | Low-level tree access (rarely used) |
MMDB_dump_entry_data_list() | ⚠️ Stub | Debugging function (rarely used) |
MMDB_get_metadata_as_entry_data_list() | ⚠️ Stub | Metadata access (rarely used) |
If your application uses these functions, please open an issue.
Important Differences
1. Binary Compatibility
Not binary compatible - you must recompile your application.
The MMDB_s struct has a different internal layout:
// libmaxminddb (many internal fields)
typedef struct MMDB_s {
// ... many implementation details
} MMDB_s;
// matchy (simpler, wraps matchy handle)
typedef struct MMDB_s {
matchy_t *_matchy_db;
uint32_t flags;
const char *filename;
ssize_t file_size;
} MMDB_s;
Impact: Applications that directly access MMDB_s fields may break. Most applications only pass the pointer around and should be fine.
2. Threading Model
libmaxminddb: Thread-safe for reads after open
matchy: Also thread-safe for reads after open
Both libraries are safe to use from multiple threads for lookups. No changes needed.
3. Memory Mapping
libmaxminddb: Optional with MMDB_MODE_MMAP
matchy: Always memory-mapped (flag accepted but ignored)
Impact: Better performance! Databases load instantly regardless of size.
4. Error Codes
Matchy uses the same error code numbers and names. Error handling code should work unchanged:
if (status != MMDB_SUCCESS) {
fprintf(stderr, "Error: %s\n", MMDB_strerror(status));
}
Build System Updates
Makefile
Before:
CFLAGS = -Wall -O2
LIBS = -lmaxminddb
myapp: myapp.c
$(CC) $(CFLAGS) -o myapp myapp.c $(LIBS)
After:
CFLAGS = -Wall -O2 -I/usr/local/include
LIBS = -L/usr/local/lib -lmatchy
myapp: myapp.c
$(CC) $(CFLAGS) -o myapp myapp.c $(LIBS)
Or use pkg-config:
CFLAGS = -Wall -O2 $(shell pkg-config --cflags matchy)
LIBS = $(shell pkg-config --libs matchy)
myapp: myapp.c
$(CC) $(CFLAGS) -o myapp myapp.c $(LIBS)
CMake
Before:
find_package(MMDB REQUIRED)
target_link_libraries(myapp PRIVATE MMDB::MMDB)
After:
find_package(PkgConfig REQUIRED)
pkg_check_modules(MATCHY REQUIRED matchy)
target_include_directories(myapp PRIVATE ${MATCHY_INCLUDE_DIRS})
target_link_libraries(myapp PRIVATE ${MATCHY_LIBRARIES})
Autotools
Before:
./configure
make
After:
./configure CFLAGS="$(pkg-config --cflags matchy)" \
LDFLAGS="$(pkg-config --libs matchy)"
make
Testing Your Migration
1. Compile Test
gcc -o test_migration test.c \
-I/usr/local/include \
-L/usr/local/lib \
-lmatchy
./test_migration GeoLite2-City.mmdb 8.8.8.8
2. Functional Test
Verify results match libmaxminddb:
# With libmaxminddb
./old_binary database.mmdb 8.8.8.8 > old_output.txt
# With matchy
./new_binary database.mmdb 8.8.8.8 > new_output.txt
# Compare
diff old_output.txt new_output.txt
3. Performance Test
Matchy should be faster or comparable:
# Benchmark lookups
time ./myapp database.mmdb < ip_list.txt
Performance Considerations
Load Time
Both libraries use memory-mapping:
libmaxminddb:
- Uses memory-mapping when MMDB_MODE_MMAP is specified
- Load time depends on disk I/O and OS page cache state
matchy:
- Always memory-mapped
- Load time depends on disk I/O and OS page cache state
Impact: Similar load performance for IP lookups. Matchy’s main advantage is supporting additional data types (strings, patterns) in the same database.
Query Performance
For IP address lookups (what libmaxminddb does), both libraries have similar performance:
- Both use binary trie traversal
- Sub-microsecond latency typical
- Performance is comparable
Impact: Migration should not significantly affect IP lookup performance. Matchy’s benefits are in unified database format and additional query types.
Memory Usage
libmaxminddb: Memory-mapped when using MMAP mode, only active pages loaded
matchy: Memory-mapped, only active pages loaded
Impact: Similar memory footprint for IP-only databases.
Troubleshooting
Compilation Errors
Error: maxminddb.h: No such file or directory
Solution: Check include path:
gcc -I/usr/local/include/matchy ...
Error: undefined reference to MMDB_open
Solution: Add matchy library:
gcc ... -lmatchy
Runtime Errors
Error: ./myapp: error while loading shared libraries: libmatchy.so
Solution: Set library path:
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
Or install system-wide:
sudo ldconfig
Behavior Differences
Issue: Results differ slightly from libmaxminddb
Check:
- Are you using the same database file?
- Is the database corrupted? Try matchy validate database.mmdb
- Are there API usage differences?
Using Native Matchy Features
After migration, you can optionally use matchy-specific features:
Pattern Matching
Matchy databases can include string patterns:
// Use native matchy API alongside MMDB API
#include <matchy/maxminddb.h>
#include <matchy/matchy.h>
// IP lookup with MMDB API
MMDB_lookup_result_s result = MMDB_lookup_string(&mmdb, "8.8.8.8", ...);
// Pattern matching with matchy API
// Query with a string, database contains patterns like "*.google.com"
matchy_result_t *pattern_result = NULL;
matchy_lookup(db, "www.google.com", &pattern_result);
Building Enhanced Databases
Use matchy build to create databases with both IP and pattern data:
matchy build -i ips.csv -i patterns.csv -o enhanced.mxy
Then query with the MMDB compatibility API as usual.
FAQ
Q: Do I need to convert my .mmdb files?
A: No! Matchy reads standard .mmdb files directly.
Q: Can I use both libmaxminddb and matchy in the same project?
A: Not recommended. They have overlapping symbols. Choose one.
Q: Is matchy slower than libmaxminddb?
A: For IP address lookups, performance is similar - both use memory-mapped binary tries. Matchy’s advantage is supporting additional query types (patterns, strings) in a unified database format.
Q: What if a function I need isn’t implemented?
A: Please open an issue with your use case.
Q: Can I contribute MMDB compatibility improvements?
A: Yes! See Contributing.
Next Steps
After migration:
- ✅ Test thoroughly with your production data
- 📊 Benchmark to verify performance improvements
- 🎯 Explore matchy-specific features (patterns, validation)
- 📖 Read the C API Reference
- 🚀 Deploy with confidence
Getting Help
- Documentation: C API Reference
- Issues: Report bugs or request features
- Examples: See examples/
- Community: Join discussions
See Also
- C API Overview - Native matchy C API
- First Database with C - C tutorial
- MMDB Compatibility - Format compatibility details
Performance Considerations
This chapter covers performance characteristics and optimization strategies for Matchy databases.
Query Performance
Different entry types have different performance characteristics:
IP Address Lookups
Speed: ~7 million queries/second
Algorithm: Binary trie traversal
Complexity: O(32) for IPv4, O(128) for IPv6 (address bit length)
$ matchy bench database.mxy
IP address lookups: 7,234,891 queries/sec (138ns avg)
IP lookups traverse a binary trie, checking one bit at a time. The depth is fixed at 32 bits (IPv4) or 128 bits (IPv6), making performance predictable.
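To make the fixed-depth traversal concrete, here is a toy bit-by-bit IPv4 prefix trie with longest-prefix lookup. It is a simplified sketch of the idea, not Matchy's on-disk structure:
use std::net::Ipv4Addr;

#[derive(Default)]
struct Node {
    children: [Option<Box<Node>>; 2], // one child per bit value (0 or 1)
    data: Option<&'static str>,       // data attached at the end of a prefix
}

fn insert(root: &mut Node, net: Ipv4Addr, prefix_len: u8, data: &'static str) {
    let bits = u32::from(net);
    let mut node = root;
    for i in 0..prefix_len {
        let bit = ((bits >> (31 - i)) & 1) as usize;
        node = node.children[bit]
            .get_or_insert_with(|| Box::new(Node::default()))
            .as_mut();
    }
    node.data = Some(data);
}

// Longest-prefix match: walk at most 32 bits, remembering the last data seen.
fn lookup(root: &Node, addr: Ipv4Addr) -> Option<&'static str> {
    let bits = u32::from(addr);
    let mut node = root;
    let mut best = node.data;
    for i in 0..32u32 {
        let bit = ((bits >> (31 - i)) & 1) as usize;
        match node.children[bit].as_deref() {
            Some(child) => {
                node = child;
                if node.data.is_some() {
                    best = node.data;
                }
            }
            None => break,
        }
    }
    best
}

fn main() {
    let mut root = Node::default();
    insert(&mut root, Ipv4Addr::new(10, 0, 0, 0), 8, "internal");
    insert(&mut root, Ipv4Addr::new(10, 1, 0, 0), 16, "lab");

    assert_eq!(lookup(&root, Ipv4Addr::new(10, 1, 2, 3)), Some("lab"));
    assert_eq!(lookup(&root, Ipv4Addr::new(10, 9, 9, 9)), Some("internal"));
    assert_eq!(lookup(&root, Ipv4Addr::new(8, 8, 8, 8)), None);
}
Matchy's trie lives in a compact memory-mapped form, but the traversal cost is the same: at most one step per address bit.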
Exact String Lookups
Speed: ~8 million queries/second
Algorithm: Hash table lookup
Complexity: O(1) constant time
$ matchy bench database.mxy
Exact string lookups: 8,932,441 queries/sec (112ns avg)
Exact strings use hash table lookups, making them the fastest entry type.
Pattern Matching
Speed: ~1-2 million queries/second (with thousands of patterns)
Algorithm: Aho-Corasick automaton
Complexity: O(n + m) where n = query length, m = number of matches
$ matchy bench database.mxy
Pattern lookups: 2,156,892 queries/sec (463ns avg)
(50,000 patterns in database)
Pattern matching searches all patterns simultaneously. Performance depends on:
- Number of patterns
- Pattern complexity
- Query string length
With thousands of patterns, expect 1-2 microseconds per query.
Loading Performance
Memory Mapping
Databases load via memory mapping, which is nearly instantaneous:
$ time matchy query large-database.mxy 1.2.3.4
real 0m0.003s # 3 milliseconds total (includes query)
Loading time is independent of database size:
- 1MB database: <1ms
- 100MB database: <1ms
- 1GB database: <1ms
The operating system maps the file into virtual memory without reading it entirely.
Traditional Loading (for comparison)
If Matchy used traditional deserialization:
Database Size Estimated Load Time
───────────── ──────────────────
1MB 50-100ms
100MB 5-10 seconds
1GB 50-100 seconds
Memory mapping eliminates this overhead entirely.
Build Performance
Building databases is a one-time cost:
$ time matchy build threats.csv -o threats.mxy
real 0m1.234s # 1.2 seconds for 100,000 entries
Build time depends on:
- Number of entries
- Number of patterns (Aho-Corasick construction)
- Data complexity
- I/O speed (writing output file)
Typical rates:
- IP/strings: ~100,000 entries/second
- Patterns: ~10,000 patterns/second (automaton construction)
Memory Usage
Database Size on Disk
Entry Type Overhead per Entry
────────── ─────────────────
IP address ~8-16 bytes (tree nodes)
CIDR range ~8-16 bytes (tree nodes)
Exact string ~12 bytes + string length (hash table)
Pattern Varies (automaton states)
Plus data storage:
- Small data (few fields): ~20-50 bytes
- Medium data (typical): ~100-500 bytes
- Large data (nested): 1KB+
Memory Usage at Runtime
With memory mapping:
- RSS (Resident Set Size): Only accessed pages loaded
- Shared memory: OS shares pages across processes
- Virtual memory: Full database mapped, but not loaded
Example with 64 processes and a 100MB database:
- Traditional: 64 × 100MB = 6,400MB RAM
- Memory mapped: ~100MB RAM (shared across processes)
The OS loads pages on-demand and shares them automatically.
Optimization Strategies
Use CIDR Ranges
Instead of adding individual IPs:
#![allow(unused)]
fn main() {
// Slow: 256 individual entries
for i in 0..256 {
builder.add_entry(&format!("192.0.2.{}", i), data.clone())?;
}
// Fast: Single CIDR entry
builder.add_entry("192.0.2.0/24", data)?;
}
CIDR ranges are more efficient than individual IPs.
Prefer Exact Strings Over Patterns
When possible, use exact strings:
#![allow(unused)]
fn main() {
// Faster: Hash table lookup
builder.add_entry("exact-domain.com", data)?;
// Slower: Pattern matching
builder.add_entry("exact-domain.*", data)?;
}
Exact strings are 4-8x faster than pattern matching.
Pattern Efficiency
Some patterns are more efficient than others:
#![allow(unused)]
fn main() {
// Efficient: Suffix patterns
builder.add_entry("*.example.com", data)?;
// Less efficient: Multiple wildcards
builder.add_entry("*evil*bad*malware*", data)?;
}
Simple patterns with few wildcards perform better.
Batch Builds
Build databases in batches rather than incrementally:
#![allow(unused)]
fn main() {
// Efficient: Build once
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
for entry in entries {
builder.add_entry(&entry.key, entry.data)?;
}
let db_bytes = builder.build()?;
// Inefficient: Don't rebuild for each entry
// (not even possible - shown for illustration)
}
Databases are immutable, so building happens once.
String Interning for Size Reduction
Added in v1.2.0: Matchy automatically deduplicates repeated string values in database data sections through string interning.
When building databases with redundant metadata, the builder detects duplicate string values and stores them only once:
#![allow(unused)]
fn main() {
// These entries share the same "threat_level": "high" string
builder.add_entry("evil1.com", r#"{"threat_level": "high", "category": "malware"}"#)?;
builder.add_entry("evil2.com", r#"{"threat_level": "high", "category": "phishing"}"#)?;
builder.add_entry("evil3.com", r#"{"threat_level": "high", "category": "spam"}"#)?;
// The string "high" is stored once and referenced three times
}
Benefits:
- Smaller databases: Significant size reduction for datasets with redundant metadata
- Zero query overhead: Interning happens at build time only
- Transparent: No API changes required - works automatically
- Faster loading: Smaller files load faster from disk
Best practices:
- Use consistent field values across entries (e.g., standardized threat levels)
- Normalize string casing and formatting
- String interning works best with categorical data (types, levels, categories)
Example size reduction:
Before v1.2.0: 1,000 entries each stored their own copy of the string "high"
  (1,000 copies of the string, plus per-string encoding overhead)
After v1.2.0: String interning stores "high" once
  (1 copy of the string, plus a small reference per entry)
The longer and more often a value repeats, the larger the savings.
Real-world savings: 10-50% database size reduction for typical threat intel datasets
Benchmarking Your Database
Use the CLI to benchmark your specific database:
$ matchy bench threats.mxy
Database: threats.mxy
Size: 15,847,293 bytes
Entries: 125,000
Running benchmarks...
IP lookups: 6,892,443 queries/sec (145ns avg)
Pattern lookups: 1,823,901 queries/sec (548ns avg)
String lookups: 8,234,892 queries/sec (121ns avg)
Completed 3,000,000 queries in 1.234 seconds
This shows real-world performance with your data.
Performance Expectations
By Database Size
Entries DB Size IP Query Pattern Query
────────── ──────── ──────── ─────────────
1,000 ~50KB ~10M/s ~5M/s
10,000 ~500KB ~8M/s ~3M/s
100,000 ~5MB ~7M/s ~2M/s
1,000,000 ~50MB ~6M/s ~1M/s
Performance degrades gracefully as databases grow.
By Pattern Count
Patterns Pattern Query Time
──────── ──────────────────
100 ~200ns
1,000 ~300ns
10,000 ~500ns
50,000 ~1-2μs
100,000 ~3-5μs
Aho-Corasick scales well, but very large pattern counts impact performance.
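The scaling behavior is easy to demonstrate with the general Aho-Corasick technique, sketched here with the third-party aho-corasick crate (1.x) rather than Matchy's internal automaton: a single pass over the query reports every matching pattern, so cost grows with query length and match count, not with the pattern count.
// Illustrative only: third-party aho-corasick crate, not Matchy's automaton.
use aho_corasick::AhoCorasick;

fn main() {
    // A hypothetical pattern set; real databases may hold tens of thousands.
    let patterns = ["evil.com", "bad.net", "malware"];
    let ac = AhoCorasick::new(patterns).unwrap();

    let query = "dropper fetched payload from cdn.evil.com (family: malware)";
    // One scan of `query` reports every matching pattern.
    for m in ac.find_iter(query) {
        println!("pattern {:?} matched at {}..{}", m.pattern(), m.start(), m.end());
    }
}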
Production Considerations
Multi-Process Deployment
Memory mapping shines in multi-process scenarios:
┌──────────┐ ┌──────────┐ ┌──────────┐
│ Worker 1 │ │ Worker 2 │ │ Worker N │
└────┬─────┘ └────┬─────┘ └────┬─────┘
│ │ │
└────────────┴────────────┘
│
┌──────────┴──────────┐
│ Database File │
│ (mmap shared) │
└──────────────────────┘
All workers share the same memory pages, dramatically reducing RAM usage.
Database Updates
To update a database:
- Build new database
- Write to temporary file
- Atomic rename over old file
#![allow(unused)]
fn main() {
let db_bytes = builder.build()?;
std::fs::write("threats.mxy.tmp", &db_bytes)?;
std::fs::rename("threats.mxy.tmp", "threats.mxy")?;
}
Existing processes keep reading the old file until they reopen.
Auto-Reload (v1.3.0+)
For zero-downtime updates with automatic reloading:
#![allow(unused)]
fn main() {
// Rust API - automatic reload with ~1-2ns overhead per query
let db = Database::from("threats.mxy")
.watch() // Enable automatic reloading
.open()?;
// Optional: Get notified when reloads happen
let db = Database::from("threats.mxy")
.watch()
.on_reload(|event| {
if event.success {
println!("Database reloaded: generation {}", event.generation);
} else {
eprintln!("Reload failed: {:?}", event.error);
}
})
.open()?;
// Database automatically reloads when file changes
// Queries transparently use the latest version
let result = db.lookup("192.168.1.1")?;
}
Performance characteristics:
- Per-query overhead: ~1-2ns (atomic generation counter check)
- Zero locks on query path after thread-local Arc is cached
- Old database stays alive until all threads finish with it
- 200ms debounce prevents rapid reload cycles
- Scales to 160+ cores without contention
C API:
#include <matchy/matchy.h>
// Callback for reload notifications
void on_reload(const matchy_reload_event_t *event, void *user_data) {
if (event->success) {
printf("Reloaded: %s (gen %lu)\n", event->path, event->generation);
} else {
fprintf(stderr, "Reload failed: %s\n", event->error);
}
}
int main() {
// Configure auto-reload with callback
matchy_open_options_t opts;
matchy_init_open_options(&opts);
opts.auto_reload = true;
opts.reload_callback = on_reload;
opts.reload_callback_user_data = NULL; // Optional context
matchy_t *db = matchy_open_with_options("threats.mxy", &opts);
// Queries automatically use latest database
matchy_result_t result;
matchy_lookup(db, "192.168.1.1", &result);
matchy_close(db);
}
How it works:
- File watcher monitors database file using OS notifications
- On file change, new database is loaded in background thread
- New database is atomically swapped using lock-free Arc pointer
- Each query thread checks generation counter (~1ns atomic load)
- If changed, thread updates its local Arc cache and clears query cache
- All subsequent queries use thread-local Arc (zero overhead!)
When to use:
- Production systems requiring zero downtime
- Threat intelligence feeds updating hourly/daily
- GeoIP databases refreshed periodically
- Any scenario where manual reload coordination is complex
Old queries complete with the old database. New queries use the new database.
Profiling Your Own Code
For developers working on Matchy or optimizing performance:
- Benchmarking Guide - Memory and CPU profiling tools
- Testing Guide - Testing strategies
Next Steps
- Database Concepts - Understanding database structure
- Entry Types - Choosing the right entry type
- Performance Benchmarks - Detailed benchmark results
Matchy Reference
The reference covers the details of various areas of Matchy.
This section provides comprehensive technical documentation for Matchy’s APIs, formats, and internals. For conceptual explanations, see the Matchy Guide.
Rust API
Detailed documentation for using Matchy from Rust:
- The Rust API - Overview and quick reference
- DatabaseBuilder - Building databases
- Database and Querying - Opening and querying
- Data Types Reference - Complete type reference
- Error Handling - Error types and handling
- Validation API - Database validation
C API
Detailed documentation for using Matchy from C/C++:
- The C API - Overview and quick reference
- Building Databases from C - Builder API
- Querying from C - Query API
- Memory Management - Memory rules
Format and Architecture
Technical specifications:
- Binary Format Specification - Database file format
- MMDB Integration Design - MaxMind compatibility
- Input File Formats - CSV, JSON, TSV formats
- Architecture Overview - Internal design
Performance
Detailed performance documentation:
- Performance Benchmarks - Comprehensive benchmark results
The Rust API
This chapter provides an overview of the Rust API. For your first steps with the Rust API, see First Database with Rust.
Core Types
The Matchy Rust API provides these main types:
Building databases:
- DatabaseBuilder - Builds new databases
- MatchMode - Case sensitivity setting
- DataValue - Structured data values
Querying databases:
- Database - Opened database (read-only)
- QueryResult - Query match results
Error handling:
- MatchyError - Error type for all operations
- Result<T> - Standard Rust result type
Quick Reference
Building a Database
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
let mut data = HashMap::new();
data.insert("field".to_string(), DataValue::String("value".to_string()));
builder.add_entry("192.0.2.1", data)?;
let db_bytes = builder.build()?;
std::fs::write("database.mxy", &db_bytes)?;
}
Querying a Database
#![allow(unused)]
fn main() {
use matchy::{Database, QueryResult};
let db = Database::open("database.mxy")?;
match db.lookup("192.0.2.1")? {
Some(QueryResult::Ip { data, prefix_len }) => {
println!("IP match: {:?}", data);
}
Some(QueryResult::Pattern { pattern_ids, data }) => {
println!("Pattern match: {} patterns", pattern_ids.len());
}
Some(QueryResult::ExactString { data }) => {
println!("Exact match: {:?}", data);
}
None => println!("No match"),
}
}
Module Structure
#![allow(unused)]
fn main() {
matchy
├── DatabaseBuilder // Building databases
├── Database // Querying databases
├── MatchMode // Case sensitivity enum
├── DataValue // Data type enum
├── QueryResult // Query result enum
└── MatchyError // Error type
}
Error Handling
All operations return Result<T, MatchyError>:
#![allow(unused)]
fn main() {
use matchy::MatchyError;
match builder.build() {
Ok(db_bytes) => { /* success */ }
Err(MatchyError::IoError(e)) => { /* I/O error */ }
Err(MatchyError::InvalidFormat { .. }) => { /* format error */ }
Err(e) => { /* other error */ }
}
}
Common error types:
- IoError - File I/O failures
- InvalidFormat - Corrupt or wrong database format
- InvalidEntry - Invalid key/data during building
- PatternError - Invalid pattern syntax
Type Conversion
From JSON
#![allow(unused)]
fn main() {
use matchy::DataValue;
use serde_json::Value;
let json: Value = serde_json::from_str(r#"{"key": "value"}"#)?;
let data = DataValue::from_json(&json)?;
}
To JSON
#![allow(unused)]
fn main() {
let json = data.to_json()?;
println!("{}", serde_json::to_string_pretty(&json)?);
}
Thread Safety
- Database is Send + Sync - safe to share across threads
- DatabaseBuilder is !Send + !Sync - use one per thread
- Query operations are thread-safe and lock-free
#![allow(unused)]
fn main() {
use std::sync::Arc;
let db = Arc::new(Database::open("database.mxy")?);
// Clone Arc and move to threads
let db_clone = Arc::clone(&db);
std::thread::spawn(move || {
db_clone.lookup("192.0.2.1")
});
}
Memory Mapping
Databases use memory mapping (mmap) for instant loading:
#![allow(unused)]
fn main() {
// Opens instantly regardless of database size
let db = Database::open("large-database.mxy")?;
// Database is memory-mapped, not loaded into heap
}
Benefits:
- Sub-millisecond loading
- Shared pages across processes
- Work with databases larger than RAM
Detailed Documentation
See the following chapters for complete details:
- DatabaseBuilder - Complete builder API
- Database and Querying - Complete query API
- Data Types Reference - All data types
API Documentation
For rustdoc-generated API documentation:
$ cargo doc --open
Or view online at docs.rs/matchy
Examples
See the Examples appendix for complete working examples.
DatabaseBuilder
DatabaseBuilder constructs new databases. See Creating a New Database
for a tutorial.
Creating a Builder
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, MatchMode};
let builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
}
With Schema Validation
Use DatabaseBuilderExt to add automatic schema validation:
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, DatabaseBuilderExt, MatchMode, DataValue};
use std::collections::HashMap;
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive)
.with_schema("threatdb")?;
// Entries are validated automatically
let mut data = HashMap::new();
data.insert("threat_level".to_string(), DataValue::String("high".to_string()));
data.insert("category".to_string(), DataValue::String("malware".to_string()));
data.insert("source".to_string(), DataValue::String("abuse.ch".to_string()));
builder.add_entry("1.2.3.4", data)?; // Validated against ThreatDB schema
}
When you use with_schema():
- All entries are validated against the schema before insertion
- The database_type metadata is automatically set (e.g., ThreatDB-v1)
- Invalid entries fail immediately with descriptive error messages
See Schemas Reference for available schemas.
Match Modes
MatchMode controls string matching behavior:
- MatchMode::CaseInsensitive - “ABC” equals “abc” (recommended for domains)
- MatchMode::CaseSensitive - “ABC” does not equal “abc”
#![allow(unused)]
fn main() {
// Case-insensitive (recommended)
let builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
// Case-sensitive
let builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
}
Adding Entries
Method Signature
#![allow(unused)]
fn main() {
pub fn add_entry<S: AsRef<str>>(
&mut self,
key: S,
data: HashMap<String, DataValue>
) -> Result<(), MatchyError>
}
Examples
IP Address:
#![allow(unused)]
fn main() {
let mut data = HashMap::new();
data.insert("country".to_string(), DataValue::String("US".to_string()));
builder.add_entry("192.0.2.1", data)?;
}
CIDR Range:
#![allow(unused)]
fn main() {
let mut data = HashMap::new();
data.insert("org".to_string(), DataValue::String("Example Inc".to_string()));
builder.add_entry("10.0.0.0/8", data)?;
}
Pattern:
#![allow(unused)]
fn main() {
let mut data = HashMap::new();
data.insert("category".to_string(), DataValue::String("search".to_string()));
builder.add_entry("*.google.com", data)?;
}
Exact String:
#![allow(unused)]
fn main() {
let mut data = HashMap::new();
data.insert("safe".to_string(), DataValue::Bool(true));
builder.add_entry("example.com", data)?;
}
Building the Database
Method Signature
#![allow(unused)]
fn main() {
pub fn build(self) -> Result<Vec<u8>, MatchyError>
}
Usage
#![allow(unused)]
fn main() {
let db_bytes = builder.build()?;
std::fs::write("database.mxy", &db_bytes)?;
}
The build() method:
- Consumes the builder (takes ownership)
- Returns Vec<u8> containing the binary database
- Can fail if entries are invalid or memory is exhausted
Complete Example
use matchy::{DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
// Add various entry types
let mut ip_data = HashMap::new();
ip_data.insert("type".to_string(), DataValue::String("ip".to_string()));
builder.add_entry("192.0.2.1", ip_data)?;
let mut cidr_data = HashMap::new();
cidr_data.insert("type".to_string(), DataValue::String("cidr".to_string()));
builder.add_entry("10.0.0.0/8", cidr_data)?;
let mut pattern_data = HashMap::new();
pattern_data.insert("type".to_string(), DataValue::String("pattern".to_string()));
builder.add_entry("*.example.com", pattern_data)?;
// Build and save
let db_bytes = builder.build()?;
std::fs::write("mixed.mxy", &db_bytes)?;
println!("Database size: {} bytes", db_bytes.len());
Ok(())
}
Entry Validation
The builder validates entries when added:
Invalid IP addresses:
#![allow(unused)]
fn main() {
builder.add_entry("256.256.256.256", data)?; // Error: InvalidEntry
}
Invalid CIDR:
#![allow(unused)]
fn main() {
builder.add_entry("10.0.0.0/33", data)?; // Error: InvalidEntry (IPv4 max is /32)
}
Invalid pattern:
#![allow(unused)]
fn main() {
builder.add_entry("[unclosed", data)?; // Error: PatternError
}
Schema Validation
When a schema is configured via with_schema(), data is validated against the schema:
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, DatabaseBuilderExt, MatchMode, DataValue};
use std::collections::HashMap;
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive)
.with_schema("threatdb")?;
// Missing required fields
let mut bad_data = HashMap::new();
bad_data.insert("threat_level".to_string(), DataValue::String("high".to_string()));
// Missing: category, source
builder.add_entry("1.2.3.4", bad_data)?;
// Error: Validation error: Entry '1.2.3.4': "category" is a required property
// Invalid enum value
let mut bad_enum = HashMap::new();
bad_enum.insert("threat_level".to_string(), DataValue::String("extreme".to_string())); // Invalid!
bad_enum.insert("category".to_string(), DataValue::String("malware".to_string()));
bad_enum.insert("source".to_string(), DataValue::String("test".to_string()));
builder.add_entry("2.3.4.5", bad_enum)?;
// Error: Validation error: Entry '2.3.4.5': "extreme" is not one of ["critical","high","medium","low","unknown"]
}
Custom Validators
For custom validation logic, implement the EntryValidator trait:
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, EntryValidator, MatchMode, DataValue};
use matchy_format::FormatError;
use std::collections::HashMap;
use std::error::Error;
struct RequiredFieldValidator {
required_fields: Vec<String>,
}
impl EntryValidator for RequiredFieldValidator {
fn validate(
&self,
key: &str,
data: &HashMap<String, DataValue>,
) -> Result<(), Box<dyn Error + Send + Sync>> {
for field in &self.required_fields {
if !data.contains_key(field) {
return Err(format!(
"Entry '{}': missing required field '{}'",
key, field
).into());
}
}
Ok(())
}
}
let validator = RequiredFieldValidator {
required_fields: vec!["name".to_string(), "category".to_string()],
};
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive)
.with_validator(Box::new(validator));
}
Building Large Databases
For large databases, add entries in a loop:
#![allow(unused)]
fn main() {
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);
for entry in large_dataset {
let mut data = HashMap::new();
data.insert("value".to_string(), DataValue::from_json(&entry.data)?);
builder.add_entry(&entry.key, data)?;
}
let db_bytes = builder.build()?;
}
Performance: ~100,000 IP/string entries per second, ~10,000 patterns per second.
Error Handling
#![allow(unused)]
fn main() {
match builder.add_entry(key, data) {
Ok(()) => println!("Added entry"),
Err(MatchyError::InvalidEntry { key, reason }) => {
eprintln!("Invalid entry {}: {}", key, reason);
}
Err(MatchyError::PatternError { pattern, reason }) => {
eprintln!("Invalid pattern {}: {}", pattern, reason);
}
Err(e) => eprintln!("Other error: {}", e),
}
}
See Also
- Database and Querying - Querying databases
- Data Types Reference - DataValue types
- First Database with Rust - Tutorial
Database and Querying
Database opens and queries databases. See First Database with Rust
for a tutorial.
Opening a Database
Basic Opening
#![allow(unused)]
fn main() {
use matchy::Database;
// Simple - uses defaults (cache enabled, validation on)
let db = Database::from("database.mxy").open()?;
}
The database is memory-mapped and loads in under 1 millisecond regardless of size.
Builder API
The recommended way to open databases uses the fluent builder API:
#![allow(unused)]
fn main() {
use matchy::Database;
// With custom cache size
let db = Database::from("database.mxy")
.cache_capacity(1000)
.open()?;
// Large cache for high repetition workloads
let db = Database::from("threats.mxy")
.cache_capacity(100_000)
.open()?;
// No cache (for unique queries)
let db = Database::from("database.mxy")
.no_cache()
.open()?;
}
Builder Methods
| Method | Description |
|---|---|
.cache_capacity(size) | Set LRU cache size (default: 10,000) |
.no_cache() | Disable caching entirely |
.open() | Load the database |
Cache Size Guidelines:
- 0 (via .no_cache()): No caching - best for diverse queries
- 100-1000: Good for moderate repetition
- 10,000 (default): Optimal for typical workloads
- 100,000+: For very high repetition (80%+ hit rate)
Note: Caching only benefits pattern lookups with high repetition. IP and literal lookups are already fast and don’t benefit from caching.
Error Handling
#![allow(unused)]
fn main() {
match Database::open("database.mxy") {
Ok(db) => { /* success */ }
Err(MatchyError::FileNotFound { path }) => {
eprintln!("Database not found: {}", path);
}
Err(MatchyError::InvalidFormat { reason }) => {
eprintln!("Invalid database format: {}", reason);
}
Err(e) => eprintln!("Error: {}", e),
}
}
Querying
lookup() - Direct String Lookup
#![allow(unused)]
fn main() {
pub fn lookup<S: AsRef<str>>(&self, query: S) -> Result<Option<QueryResult>, MatchyError>
}
Basic usage:
#![allow(unused)]
fn main() {
match db.lookup("192.0.2.1")? {
Some(result) => println!("Found: {:?}", result),
None => println!("Not found"),
}
}
lookup_extracted() - Lookup After Extraction
#![allow(unused)]
fn main() {
pub fn lookup_extracted(
&self,
item: &matchy::extractor::Match,
input: &[u8],
) -> Result<Option<QueryResult>, DatabaseError>
}
Efficient lookup for extracted patterns. Automatically uses the optimal lookup path:
- IP addresses use typed lookup_ip() (avoids string parsing)
- Other types use string-based lookup()
Usage:
#![allow(unused)]
fn main() {
use matchy::{Database, extractor::Extractor};
let db = Database::from("threats.mxy").open()?;
let extractor = Extractor::new()?;
let log_line = b"Connection from 192.168.1.1 to evil.com";
for item in extractor.extract_from_line(log_line) {
if let Some(result) = db.lookup_extracted(&item, log_line)? {
println!("Match: {} (type: {})",
item.as_str(log_line),
item.item.type_name()
);
}
}
}
Why use this?
- Cleaner code: No manual matching on ExtractedItem variants
- Better performance: IP addresses use direct typed lookups
- Future-proof: New extracted types work automatically
Parameters:
- item: The extracted match from Extractor
- input: Original input buffer (needed to extract string slices)
Returns: Ok(Some(QueryResult)) if found, Ok(None) if not found
See the Querying guide for more examples.
QueryResult Types
QueryResult is an enum with three variants:
IP Match
#![allow(unused)]
fn main() {
QueryResult::Ip {
data: Option<HashMap<String, DataValue>>,
prefix_len: u8,
}
}
Example:
#![allow(unused)]
fn main() {
match db.lookup("192.0.2.1")? {
Some(QueryResult::Ip { data, prefix_len }) => {
println!("Matched IP with prefix /{}", prefix_len);
if let Some(d) = data {
println!("Data: {:?}", d);
}
}
_ => {}
}
}
Pattern Match
#![allow(unused)]
fn main() {
QueryResult::Pattern {
pattern_ids: Vec<u32>,
data: Vec<Option<HashMap<String, DataValue>>>,
}
}
Example:
#![allow(unused)]
fn main() {
match db.lookup("mail.google.com")? {
Some(QueryResult::Pattern { pattern_ids, data }) => {
println!("Matched {} pattern(s)", pattern_ids.len());
for (i, pattern_data) in data.iter().enumerate() {
println!("Pattern {}: {:?}", pattern_ids[i], pattern_data);
}
}
_ => {}
}
}
Note: A query can match multiple patterns. All matching patterns are returned.
Exact String Match
#![allow(unused)]
fn main() {
QueryResult::ExactString {
data: Option<HashMap<String, DataValue>>,
}
}
Example:
#![allow(unused)]
fn main() {
match db.lookup("example.com")? {
Some(QueryResult::ExactString { data }) => {
println!("Exact match: {:?}", data);
}
_ => {}
}
}
Complete Example
use matchy::{Database, QueryResult};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let db = Database::open("database.mxy")?;
// Query different types
let queries = vec![
"192.0.2.1", // IP
"10.5.5.5", // CIDR
"test.example.com", // Pattern
"example.com", // Exact string
];
for query in queries {
match db.lookup(query)? {
Some(QueryResult::Ip { prefix_len, .. }) => {
println!("{}: IP match (/{prefix_len})", query);
}
Some(QueryResult::Pattern { pattern_ids, .. }) => {
println!("{}: Pattern match ({} patterns)", query, pattern_ids.len());
}
Some(QueryResult::ExactString { .. }) => {
println!("{}: Exact match", query);
}
None => {
println!("{}: No match", query);
}
}
}
Ok(())
}
Thread Safety
Database is Send + Sync and can be safely shared across threads:
#![allow(unused)]
fn main() {
use std::sync::Arc;
use std::thread;
let db = Arc::new(Database::open("database.mxy")?);
let handles: Vec<_> = (0..4).map(|i| {
let db = Arc::clone(&db);
thread::spawn(move || {
db.lookup(&format!("192.0.2.{}", i))
})
}).collect();
for handle in handles {
handle.join().unwrap()?;
}
}
Performance
Query performance by entry type:
- IP addresses: ~7 million queries/second (138ns avg)
- Exact strings: ~8 million queries/second (112ns avg)
- Patterns: ~1-2 million queries/second (500ns-1μs avg)
See Performance Considerations for details.
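Actual throughput varies with hardware, cache settings, and database contents. If you want indicative numbers for your own workload, a rough probe is easy to write; the sketch below disables the cache so repeated queries are not served from the LRU, and the query strings and iteration count are purely illustrative:
use matchy::Database;
use std::time::Instant;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let db = Database::from("database.mxy").no_cache().open()?;
    let queries = ["192.0.2.1", "example.com", "test.example.com"];
    let iterations: usize = 100_000;
    let start = Instant::now();
    let mut hits = 0u64;
    for i in 0..iterations {
        // Rotate through a few representative queries
        if db.lookup(queries[i % queries.len()])?.is_some() {
            hits += 1;
        }
    }
    let elapsed = start.elapsed();
    println!(
        "{} lookups ({} hits) in {:.2?} (~{:.0} ns/lookup)",
        iterations,
        hits,
        elapsed,
        elapsed.as_nanos() as f64 / iterations as f64
    );
    Ok(())
}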
Database Statistics
Get Statistics
Retrieve comprehensive statistics about database usage:
#![allow(unused)]
fn main() {
use matchy::Database;
let db = Database::from("threats.mxy").open()?;
// Do some queries
db.lookup("1.2.3.4")?;
db.lookup("example.com")?;
db.lookup("test.com")?;
// Get stats
let stats = db.stats();
println!("Total queries: {}", stats.total_queries);
println!("Queries with match: {}", stats.queries_with_match);
println!("Cache hit rate: {:.1}%", stats.cache_hit_rate() * 100.0);
println!("Match rate: {:.1}%", stats.match_rate() * 100.0);
println!("IP queries: {}", stats.ip_queries);
println!("String queries: {}", stats.string_queries);
}
DatabaseStats Structure
#![allow(unused)]
fn main() {
pub struct DatabaseStats {
pub total_queries: u64,
pub queries_with_match: u64,
pub queries_without_match: u64,
pub cache_hits: u64,
pub cache_misses: u64,
pub ip_queries: u64,
pub string_queries: u64,
}
impl DatabaseStats {
pub fn cache_hit_rate(&self) -> f64
pub fn match_rate(&self) -> f64
}
}
Helper Methods:
- cache_hit_rate() - Returns the cache hit rate as a value from 0.0 to 1.0
- match_rate() - Returns the query match rate as a value from 0.0 to 1.0
Interpreting Statistics
Cache Performance:
- Hit rate < 50%: Consider disabling cache (.no_cache())
- Hit rate 50-80%: Cache is helping moderately
- Hit rate > 80%: Cache is very effective
Query Distribution:
- High ip_queries: Database is being used for IP lookups
- High string_queries: Database is being used for domain/pattern matching
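One way to act on these numbers is to check the hit rate after a warm-up period and reopen without the cache when it is not pulling its weight. A sketch; the 10,000-query warm-up threshold and the 50% cutoff simply follow the guideline above:
#![allow(unused)]
fn main() {
use matchy::Database;
fn reopen_if_cache_ineffective(
    path: &str,
    db: Database,
) -> Result<Database, Box<dyn std::error::Error>> {
    let stats = db.stats();
    // Judge only after a meaningful sample of queries
    if stats.total_queries > 10_000 && stats.cache_hit_rate() < 0.5 {
        // Hit rate below 50%: caching costs more than it saves here
        let fresh = Database::from(path).no_cache().open()?;
        return Ok(fresh);
    }
    Ok(db)
}
}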
Cache Management
Clear Cache
Remove all cached query results:
#![allow(unused)]
fn main() {
use matchy::Database;
let db = Database::from("threats.mxy").open()?;
// Do some queries (fills cache)
db.lookup("example.com")?;
// Clear cache to force fresh lookups
db.clear_cache();
}
Useful for benchmarking or when you need to ensure fresh lookups without reopening the database.
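For example, when benchmarking you can compare warm-cache lookups with cold lookups of the same query by clearing the cache between iterations; the query string and iteration counts below are arbitrary:
use matchy::Database;
use std::time::Instant;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let db = Database::from("threats.mxy").cache_capacity(10_000).open()?;
    let query = "malware.evil.com";
    // Warm the cache once
    db.lookup(query)?;
    let start = Instant::now();
    for _ in 0..100_000 {
        db.lookup(query)?;
    }
    println!("warm cache: {:.2?}", start.elapsed());
    // Clear between lookups so every iteration takes the uncached path
    db.clear_cache();
    let start = Instant::now();
    for _ in 0..100_000 {
        db.lookup(query)?;
        db.clear_cache();
    }
    println!("cold lookups: {:.2?}", start.elapsed());
    Ok(())
}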
Helper Methods
Checking Entry Types
#![allow(unused)]
fn main() {
if let Some(QueryResult::Ip { .. }) = result {
// Handle IP match
}
}
Or using match guards:
#![allow(unused)]
fn main() {
match db.lookup(query)? {
Some(QueryResult::Ip { prefix_len, .. }) if prefix_len == 32 => {
println!("Exact IP match");
}
Some(QueryResult::Ip { prefix_len, .. }) => {
println!("CIDR match /{}", prefix_len);
}
_ => {}
}
}
Database Lifecycle
Databases are immutable once opened:
#![allow(unused)]
fn main() {
let db = Database::open("database.mxy")?;
// db.lookup(...) - OK
// db.add_entry(...) - No such method!
}
To update a database:
- Build a new database with DatabaseBuilder
- Write to a temporary file
- Atomically replace the old database
#![allow(unused)]
fn main() {
// Build new database
let db_bytes = builder.build()?;
std::fs::write("database.mxy.tmp", &db_bytes)?;
std::fs::rename("database.mxy.tmp", "database.mxy")?;
// Reopen
let db = Database::open("database.mxy")?;
}
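In a long-running service you usually want in-flight queries to keep working while the replacement file is swapped in. A minimal sketch of that pattern using std::sync::RwLock; the helper functions are ours, not part of the Matchy API:
#![allow(unused)]
fn main() {
use std::sync::{Arc, RwLock};
use matchy::Database;
fn reload(shared: &RwLock<Arc<Database>>, path: &str) -> Result<(), Box<dyn std::error::Error>> {
    // Open the freshly written file first...
    let new_db = Arc::new(Database::open(path)?);
    // ...then swap it in; readers still holding the old Arc finish unaffected
    *shared.write().unwrap() = new_db;
    Ok(())
}
fn query(shared: &RwLock<Arc<Database>>, q: &str) -> Result<bool, Box<dyn std::error::Error>> {
    // Clone the inner Arc so the read lock is held only briefly
    let db = Arc::clone(&shared.read().unwrap());
    Ok(db.lookup(q)?.is_some())
}
}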
See Also
- DatabaseBuilder - Building databases
- Data Types Reference - Data value types
- Performance Considerations - Optimization
Data Types Reference
Matchy databases store arbitrary data with each entry using the DataValue type system.
Overview
DataValue is a Rust enum supporting these types:
- Bool: Boolean values
- U16: 16-bit unsigned integers
- U32: 32-bit unsigned integers
- U64: 64-bit unsigned integers
- I32: 32-bit signed integers
- F32: 32-bit floating point
- F64: 64-bit floating point
- String: UTF-8 text
- Bytes: Arbitrary binary data
- Array: Ordered list of values
- Map: Key-value mappings
- Timestamp: Unix epoch seconds (compact storage for ISO 8601 timestamps)
See Data Types for conceptual overview.
DataValue Enum
#![allow(unused)]
fn main() {
pub enum DataValue {
Bool(bool),
U16(u16),
U32(u32),
U64(u64),
I32(i32),
F32(f32),
F64(f64),
String(String),
Bytes(Vec<u8>),
Array(Vec<DataValue>),
Map(HashMap<String, DataValue>),
Timestamp(i64), // Unix epoch seconds
}
}
Creating Values
Direct Construction
#![allow(unused)]
fn main() {
use matchy::DataValue;
let bool_val = DataValue::Bool(true);
let int_val = DataValue::U32(42);
let str_val = DataValue::String("hello".to_string());
}
Using From/Into
#![allow(unused)]
fn main() {
let val: DataValue = 42u32.into();
let val: DataValue = "text".to_string().into();
let val: DataValue = true.into();
}
Working with Maps
Maps are the most common data structure:
#![allow(unused)]
fn main() {
use std::collections::HashMap;
use matchy::DataValue;
let mut data = HashMap::new();
data.insert("country".to_string(), DataValue::String("US".to_string()));
data.insert("asn".to_string(), DataValue::U32(15169));
data.insert("lat".to_string(), DataValue::F64(37.751));
data.insert("lon".to_string(), DataValue::F64(-97.822));
}
Working with Arrays
#![allow(unused)]
fn main() {
let tags = DataValue::Array(vec![
DataValue::String("cdn".to_string()),
DataValue::String("cloud".to_string()),
]);
data.insert("tags".to_string(), tags);
}
Working with Timestamps
Timestamps store Unix epoch seconds compactly (8 bytes vs 27-byte ISO 8601 strings):
#![allow(unused)]
fn main() {
use matchy::DataValue;
let first_seen = DataValue::Timestamp(1727891071);
data.insert("first_seen".to_string(), first_seen);
}
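When stamping entries with the current time rather than parsed input, epoch seconds can come straight from the standard library. A small sketch; the last_seen field name is illustrative:
#![allow(unused)]
fn main() {
use std::collections::HashMap;
use std::time::{SystemTime, UNIX_EPOCH};
use matchy::DataValue;
let mut data = HashMap::new();
// Current wall-clock time as Unix epoch seconds
let now = SystemTime::now()
    .duration_since(UNIX_EPOCH)
    .expect("system clock before Unix epoch")
    .as_secs() as i64;
data.insert("last_seen".to_string(), DataValue::Timestamp(now));
}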
ISO 8601 strings in JSON input are automatically parsed into Timestamps during deserialization:
{
"entry": "1.2.3.4",
"first_seen": "2025-10-02T18:44:31Z"
}
When serialized back to JSON, Timestamps render as ISO 8601 strings for readability.
Nested Structures
#![allow(unused)]
fn main() {
let mut location = HashMap::new();
location.insert("city".to_string(), DataValue::String("Mountain View".to_string()));
location.insert("country".to_string(), DataValue::String("US".to_string()));
data.insert("location".to_string(), DataValue::Map(location));
}
Type Conversion
Extracting Values
#![allow(unused)]
fn main() {
match value {
DataValue::String(s) => println!("String: {}", s),
DataValue::U32(n) => println!("Number: {}", n),
DataValue::Map(m) => {
for (k, v) in m {
println!("{}: {:?}", k, v);
}
}
_ => println!("Other type"),
}
}
Helper Functions
#![allow(unused)]
fn main() {
fn get_string(val: &DataValue) -> Option<&str> {
match val {
DataValue::String(s) => Some(s),
_ => None,
}
}
fn get_u32(val: &DataValue) -> Option<u32> {
match val {
DataValue::U32(n) => Some(*n),
_ => None,
}
}
}
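Used against a query result, such helpers keep field access short. A sketch assuming an asn field was stored at build time:
use matchy::{Database, DataValue, QueryResult};
fn get_u32(val: &DataValue) -> Option<u32> {
    match val {
        DataValue::U32(n) => Some(*n),
        _ => None,
    }
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let db = Database::open("database.mxy")?;
    if let Some(QueryResult::Ip { data: Some(data), .. }) = db.lookup("8.8.8.8")? {
        // "asn" is a hypothetical field; substitute whatever your builder stored
        if let Some(asn) = data.get("asn").and_then(get_u32) {
            println!("ASN: {}", asn);
        }
    }
    Ok(())
}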
Complete Example
use matchy::{DatabaseBuilder, DataValue};
use std::collections::HashMap;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = DatabaseBuilder::new();
// IP with rich data
let mut ip_data = HashMap::new();
ip_data.insert("country".to_string(), DataValue::String("US".to_string()));
ip_data.insert("asn".to_string(), DataValue::U32(15169));
ip_data.insert("tags".to_string(), DataValue::Array(vec![
DataValue::String("datacenter".to_string()),
DataValue::String("cloud".to_string()),
]));
builder.add_ip_entry("8.8.8.8/32", Some(ip_data))?;
// Pattern with metadata
let mut pattern_data = HashMap::new();
pattern_data.insert("category".to_string(), DataValue::String("search".to_string()));
pattern_data.insert("priority".to_string(), DataValue::U16(100));
builder.add_pattern_entry("*.google.com", Some(pattern_data))?;
let db_bytes = builder.build()?;
std::fs::write("database.mxy", &db_bytes)?;
Ok(())
}
Binary Format
DataValue types are serialized to the MMDB binary format:
| DataValue | MMDB Type | Notes |
|---|---|---|
| Bool | boolean | 1 bit |
| U16 | uint16 | 2 bytes |
| U32 | uint32 | 4 bytes |
| U64 | uint64 | 8 bytes |
| I32 | int32 | 4 bytes |
| F32 | float | IEEE 754 |
| F64 | double | IEEE 754 |
| String | utf8_string | Length-prefixed |
| Bytes | bytes | Length-prefixed |
| Array | array | Recursive |
| Map | map | Key-value pairs |
| Timestamp | ext 128 | 8 bytes, Matchy extension |
See Binary Format for encoding details.
Size Limits
- Strings: Up to 16 MB per string
- Bytes: Up to 16 MB per byte array
- Arrays: Up to 65,536 elements
- Maps: Up to 65,536 key-value pairs
- Nesting: Up to 64 levels deep
Performance
Data types have different serialization costs:
| Type | Cost | Notes |
|---|---|---|
| Bool, integers | O(1) | Fixed size |
| F32, F64 | O(1) | Fixed size |
| String | O(n) | Length-dependent |
| Bytes | O(n) | Length-dependent |
| Array | O(n × m) | n = length, m = element cost |
| Map | O(n × m) | n = entries, m = value cost |
Prefer smaller types when possible:
- Use U16 instead of U32 if values fit
- Use I32 instead of F64 for integers
- Avoid deep nesting
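As a concrete example, the same entry data can often be stored with the smaller variants; the field names and values below are illustrative:
#![allow(unused)]
fn main() {
use std::collections::HashMap;
use matchy::DataValue;
let mut compact = HashMap::new();
// 2 bytes instead of 4: the score never exceeds 65,535
compact.insert("score".to_string(), DataValue::U16(95));
// 8 bytes instead of a 20+ byte ISO 8601 string
compact.insert("first_seen".to_string(), DataValue::Timestamp(1727891071));
// Keep single values flat rather than wrapping them in a nested Map
compact.insert("country".to_string(), DataValue::String("US".to_string()));
}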
Serialization Example
#![allow(unused)]
fn main() {
use matchy::{Database, QueryResult, DataValue};
let db = Database::open("database.mxy")?;
if let Some(QueryResult::Ip { data: Some(data), .. }) = db.lookup("8.8.8.8")? {
// Extract specific fields
if let Some(DataValue::String(country)) = data.get("country") {
println!("Country: {}", country);
}
if let Some(DataValue::U32(asn)) = data.get("asn") {
println!("ASN: {}", asn);
}
if let Some(DataValue::Array(tags)) = data.get("tags") {
println!("Tags:");
for tag in tags {
if let DataValue::String(s) = tag {
println!(" - {}", s);
}
}
}
}
}
JSON Conversion
DataValue maps naturally to JSON:
#![allow(unused)]
fn main() {
use serde_json::json;
// DataValue to JSON (conceptual)
fn to_json(val: &DataValue) -> serde_json::Value {
match val {
DataValue::Bool(b) => json!(b),
DataValue::U32(n) => json!(n),
DataValue::String(s) => json!(s),
DataValue::Array(arr) => {
json!(arr.iter().map(to_json).collect::<Vec<_>>())
}
DataValue::Map(map) => {
let obj: serde_json::Map<String, serde_json::Value> =
map.iter().map(|(k, v)| (k.clone(), to_json(v))).collect();
json!(obj)
}
_ => json!(null), // remaining variants (Bytes, Timestamp, ...) would need custom handling
}
}
}
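The reverse direction is similar. DataValue::from_json (used in the builder example earlier) is the supported path; conceptually the mapping looks like this, with number and null handling simplified:
#![allow(unused)]
fn main() {
use matchy::DataValue;
use serde_json::Value;
// JSON to DataValue (conceptual, mirrors to_json above)
fn from_json(val: &Value) -> Option<DataValue> {
    Some(match val {
        Value::Null => return None, // no direct DataValue equivalent in this sketch
        Value::Bool(b) => DataValue::Bool(*b),
        Value::Number(n) if n.is_u64() => DataValue::U64(n.as_u64()?),
        Value::Number(n) => DataValue::F64(n.as_f64()?),
        Value::String(s) => DataValue::String(s.clone()),
        Value::Array(arr) => DataValue::Array(arr.iter().filter_map(from_json).collect()),
        Value::Object(map) => DataValue::Map(
            map.iter()
                .filter_map(|(k, v)| from_json(v).map(|dv| (k.clone(), dv)))
                .collect(),
        ),
    })
}
}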
See Also
- Data Types Guide - Conceptual overview
- DatabaseBuilder - Adding data
- Database Querying - Reading data
- Binary Format - Serialization details
Error Handling Reference
All fallible operations in Matchy return Result<T, MatchyError>.
MatchyError Type
#![allow(unused)]
fn main() {
pub enum MatchyError {
/// File does not exist
FileNotFound { path: String },
/// Invalid database format
InvalidFormat { reason: String },
/// Corrupted database data
CorruptData { offset: usize, reason: String },
/// Invalid entry (IP, pattern, string)
InvalidEntry { entry: String, reason: String },
/// I/O error
IoError(std::io::Error),
/// Memory mapping failed
MmapError(String),
/// Pattern compilation failed
PatternError { pattern: String, reason: String },
/// Internal error
InternalError(String),
}
}
Common Error Patterns
Opening a Database
#![allow(unused)]
fn main() {
use matchy::{Database, MatchyError};
match Database::open("database.mxy") {
Ok(db) => { /* success */ }
Err(MatchyError::FileNotFound { path }) => {
eprintln!("Database not found: {}", path);
// Handle missing file - maybe create default?
}
Err(MatchyError::InvalidFormat { reason }) => {
eprintln!("Invalid format: {}", reason);
// File exists but not valid matchy database
}
Err(MatchyError::CorruptData { offset, reason }) => {
eprintln!("Corrupted at offset {}: {}", offset, reason);
// Database is damaged - rebuild required
}
Err(e) => {
eprintln!("Unexpected error: {}", e);
return Err(e.into());
}
}
}
Building a Database
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, MatchyError};
let mut builder = DatabaseBuilder::new();
// Add entries with error handling
match builder.add_ip_entry("192.0.2.1/32", None) {
Ok(_) => {}
Err(MatchyError::InvalidEntry { entry, reason }) => {
eprintln!("Invalid IP '{}': {}", entry, reason);
// Skip this entry and continue
}
Err(e) => return Err(e.into()),
}
// Build with error handling
match builder.build() {
Ok(bytes) => {
std::fs::write("database.mxy", &bytes)?;
}
Err(MatchyError::InternalError(msg)) => {
eprintln!("Build failed: {}", msg);
return Err(msg.into());
}
Err(e) => return Err(e.into()),
}
}
Querying
#![allow(unused)]
fn main() {
use matchy::{Database, MatchyError};
let db = Database::open("database.mxy")?;
match db.lookup("example.com") {
Ok(Some(result)) => {
println!("Found: {:?}", result);
}
Ok(None) => {
println!("Not found");
}
Err(MatchyError::CorruptData { offset, reason }) => {
eprintln!("Data corruption at {}: {}", offset, reason);
// Database may be partially readable
}
Err(e) => {
eprintln!("Lookup error: {}", e);
return Err(e.into());
}
}
}
Error Context
Use context methods to add helpful information:
#![allow(unused)]
fn main() {
use matchy::Database;
fn load_db(path: &str) -> Result<Database, Box<dyn std::error::Error>> {
Database::open(path)
.map_err(|e| format!("Failed to load database from '{}': {}", path, e).into())
}
}
Or with anyhow:
#![allow(unused)]
fn main() {
use anyhow::{Context, Result};
use matchy::Database;
fn load_db(path: &str) -> Result<Database> {
Database::open(path)
.with_context(|| format!("Failed to load database from '{}'", path))
}
}
Validation Errors
IP Address Validation
#![allow(unused)]
fn main() {
builder.add_ip_entry("not-an-ip", None)?;
// Error: InvalidEntry { entry: "not-an-ip", reason: "Invalid IP address" }
builder.add_ip_entry("192.0.2.1/33", None)?;
// Error: InvalidEntry { entry: "192.0.2.1/33", reason: "Invalid prefix length" }
}
Pattern Validation
#![allow(unused)]
fn main() {
builder.add_pattern_entry("*.*.com", None)?;
// Error: PatternError { pattern: "*.*.com", reason: "Multiple wildcards" }
builder.add_pattern_entry("[invalid", None)?;
// Error: PatternError { pattern: "[invalid", reason: "Unclosed bracket" }
}
String Validation
#![allow(unused)]
fn main() {
builder.add_exact_entry("", None)?;
// Error: InvalidEntry { entry: "", reason: "Empty string" }
}
Error Recovery
Partial Success
Continue after validation errors:
#![allow(unused)]
fn main() {
let entries = vec!["192.0.2.1", "not-valid", "10.0.0.1"];
let mut success_count = 0;
let mut error_count = 0;
for entry in entries {
match builder.add_ip_entry(entry, None) {
Ok(_) => success_count += 1,
Err(e) => {
eprintln!("Skipping invalid entry '{}': {}", entry, e);
error_count += 1;
}
}
}
println!("Added {} entries, skipped {} invalid", success_count, error_count);
}
Fallback Databases
#![allow(unused)]
fn main() {
let db = Database::open("primary.mxy")
.or_else(|_| Database::open("backup.mxy"))
.or_else(|_| Database::open("default.mxy"))?;
}
Retry Logic
#![allow(unused)]
fn main() {
use std::time::Duration;
use std::thread;
fn open_with_retry(path: &str, max_attempts: u32) -> Result<Database, MatchyError> {
for attempt in 1..=max_attempts {
match Database::open(path) {
Ok(db) => return Ok(db),
Err(MatchyError::IoError(_)) if attempt < max_attempts => {
eprintln!("Attempt {} failed, retrying...", attempt);
thread::sleep(Duration::from_millis(100 * attempt as u64));
}
Err(e) => return Err(e),
}
}
unreachable!()
}
}
Display Implementation
All errors implement Display:
#![allow(unused)]
fn main() {
use matchy::MatchyError;
let err = MatchyError::FileNotFound {
path: "missing.mxy".to_string()
};
println!("{}", err);
// Output: Database file not found: missing.mxy
eprintln!("Error: {}", err);
// Stderr: Error: Database file not found: missing.mxy
}
Error Conversion
To std::io::Error
#![allow(unused)]
fn main() {
impl From<MatchyError> for std::io::Error {
fn from(err: MatchyError) -> Self {
match err {
MatchyError::FileNotFound { path } => {
std::io::Error::new(
std::io::ErrorKind::NotFound,
format!("Database not found: {}", path)
)
}
MatchyError::IoError(e) => e,
_ => std::io::Error::new(std::io::ErrorKind::Other, err.to_string()),
}
}
}
}
To Box<dyn Error>
#![allow(unused)]
fn main() {
fn do_work() -> Result<(), Box<dyn std::error::Error>> {
let db = Database::open("db.mxy")?;
// MatchyError automatically converts
Ok(())
}
}
Best Practices
1. Match Specific Errors First
#![allow(unused)]
fn main() {
match db.lookup(query) {
Ok(Some(result)) => { /* handle result */ }
Ok(None) => { /* handle not found */ }
Err(MatchyError::CorruptData { .. }) => { /* handle corruption */ }
Err(e) => { /* generic handler */ }
}
}
2. Provide Context
#![allow(unused)]
fn main() {
builder.add_ip_entry(ip, data)
.map_err(|e| format!("Failed to add IP '{}': {}", ip, e))?;
}
3. Log Errors
#![allow(unused)]
fn main() {
use log::{error, warn};
match Database::open(path) {
Ok(db) => db,
Err(e) => {
error!("Failed to open database '{}': {}", path, e);
return Err(e.into());
}
}
}
4. Use Result Type Aliases
#![allow(unused)]
fn main() {
type Result<T> = std::result::Result<T, MatchyError>;
fn my_function() -> Result<Database> {
Database::open("database.mxy")
}
}
Complete Example
use matchy::{Database, DatabaseBuilder, MatchyError};
use std::fs;
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Try to open existing database
let db = match Database::open("cache.mxy") {
Ok(db) => {
println!("Loaded existing database");
db
}
Err(MatchyError::FileNotFound { .. }) => {
println!("Building new database...");
build_database()?
}
Err(e) => {
eprintln!("Error opening database: {}", e);
return Err(e.into());
}
};
// Query with error handling
let queries = vec!["192.0.2.1", "example.com", "*.google.com"];
for query in queries {
match db.lookup(query) {
Ok(Some(result)) => {
println!("{}: {:?}", query, result);
}
Ok(None) => {
println!("{}: Not found", query);
}
Err(e) => {
eprintln!("{}: Error - {}", query, e);
}
}
}
Ok(())
}
fn build_database() -> Result<Database, Box<dyn std::error::Error>> {
let mut builder = DatabaseBuilder::new();
// Add entries with individual error handling
let entries = vec![
("192.0.2.1", "Valid IP"),
("not-an-ip", "Invalid - will skip"),
("10.0.0.0/8", "Valid CIDR"),
];
for (entry, description) in entries {
match builder.add_ip_entry(entry, None) {
Ok(_) => println!("Added: {} ({})", entry, description),
Err(e) => eprintln!("Skipped: {} - {}", entry, e),
}
}
// Build and save
let db_bytes = builder.build()?;
fs::write("cache.mxy", &db_bytes)?;
// Reopen
Database::open("cache.mxy").map_err(Into::into)
}
See Also
- DatabaseBuilder - Building with validation
- Database Querying - Query errors
- Rust Error Handling
Validation API
Programmatic database validation for Rust applications.
Overview
The validation API allows you to validate Matchy databases from Rust code before loading them. This is essential when working with databases from untrusted sources or when you need detailed validation reports.
#![allow(unused)]
fn main() {
use matchy::validation::{validate_database, ValidationLevel};
use std::path::Path;
let report = validate_database(Path::new("database.mxy"), ValidationLevel::Strict)?;
if report.is_valid() {
println!("✓ Database is safe to use");
// Safe to open and use
let db = Database::open("database.mxy")?;
} else {
eprintln!("✗ Validation failed:");
for error in &report.errors {
eprintln!(" - {}", error);
}
}
}
Main Function
validate_database
#![allow(unused)]
fn main() {
pub fn validate_database(
path: &Path,
level: ValidationLevel
) -> Result<ValidationReport, MatchyError>
}
Validates a database file and returns a detailed report.
Parameters:
- path - Path to the .mxy database file
- level - Validation strictness level
Returns: ValidationReport with errors, warnings, and statistics
Example:
#![allow(unused)]
fn main() {
use matchy::validation::{validate_database, ValidationLevel};
use std::path::Path;
let report = validate_database(
Path::new("database.mxy"),
ValidationLevel::Strict
)?;
println!("Validation complete:");
println!(" Errors: {}", report.errors.len());
println!(" Warnings: {}", report.warnings.len());
println!(" {}", report.stats.summary());
}
ValidationLevel
#![allow(unused)]
fn main() {
pub enum ValidationLevel {
Standard, // Basic safety checks
Strict, // Deep analysis (default)
Audit, // Security audit mode
}
}
Standard
Fast validation with essential checks:
- File format structure
- Offset bounds checking
- UTF-8 string validity
- Basic graph structure
#![allow(unused)]
fn main() {
let report = validate_database(path, ValidationLevel::Standard)?;
}
Strict (Recommended)
Comprehensive validation including:
- All standard checks
- Cycle detection
- Redundancy analysis
- Deep consistency checks
- Pattern reachability
#![allow(unused)]
fn main() {
let report = validate_database(path, ValidationLevel::Strict)?;
}
Audit
All strict checks plus security analysis:
- Track unsafe code locations
- Document trust assumptions
- Report validation bypasses
#![allow(unused)]
fn main() {
let report = validate_database(path, ValidationLevel::Audit)?;
if report.is_valid() {
println!("Unsafe code locations: {}",
report.stats.unsafe_code_locations.len());
println!("Trust assumptions: {}",
report.stats.trust_assumptions.len());
}
}
ValidationReport
#![allow(unused)]
fn main() {
pub struct ValidationReport {
pub errors: Vec<String>,
pub warnings: Vec<String>,
pub info: Vec<String>,
pub stats: DatabaseStats,
}
}
Methods
is_valid()
#![allow(unused)]
fn main() {
pub fn is_valid(&self) -> bool
}
Returns true if there are no errors (warnings are allowed).
#![allow(unused)]
fn main() {
if report.is_valid() {
// Safe to use
let db = Database::open(path)?;
}
}
Fields
errors
Critical errors that make the database unusable:
#![allow(unused)]
fn main() {
if !report.errors.is_empty() {
eprintln!("Critical errors found:");
for error in &report.errors {
eprintln!(" ❌ {}", error);
}
}
}
warnings
Non-fatal issues that may indicate problems:
#![allow(unused)]
fn main() {
if !report.warnings.is_empty() {
println!("Warnings:");
for warning in &report.warnings {
println!(" ⚠️ {}", warning);
}
}
}
info
Informational messages about the validation process:
#![allow(unused)]
fn main() {
for info in &report.info {
println!(" ℹ️ {}", info);
}
}
DatabaseStats
#![allow(unused)]
fn main() {
pub struct DatabaseStats {
pub file_size: usize,
pub version: u32,
pub ac_node_count: u32,
pub pattern_count: u32,
pub ip_entry_count: u32,
pub literal_count: u32,
pub glob_count: u32,
pub string_data_size: u32,
pub has_data_section: bool,
pub has_ac_literal_mapping: bool,
pub max_ac_depth: u8,
pub state_encoding_distribution: [u32; 4],
pub unsafe_code_locations: Vec<UnsafeCodeLocation>,
pub trust_assumptions: Vec<TrustAssumption>,
}
}
Methods
summary()
#![allow(unused)]
fn main() {
pub fn summary(&self) -> String
}
Returns a human-readable summary:
#![allow(unused)]
fn main() {
println!("{}", report.stats.summary());
// Output: "Version: v2, Nodes: 1234, Patterns: 56 (20 literal, 36 glob), IPs: 100, Size: 128 KB"
}
Example Usage
#![allow(unused)]
fn main() {
let stats = &report.stats;
println!("Database Statistics:");
println!(" File size: {} KB", stats.file_size / 1024);
println!(" Version: v{}", stats.version);
println!(" Patterns: {} ({} literal, {} glob)",
stats.pattern_count, stats.literal_count, stats.glob_count);
println!(" IP entries: {}", stats.ip_entry_count);
println!(" AC nodes: {}", stats.ac_node_count);
println!(" Max depth: {}", stats.max_ac_depth);
}
Complete Example
use matchy::{Database, validation::{validate_database, ValidationLevel}};
use std::path::Path;
fn load_safe_database(path: &Path) -> Result<Database, Box<dyn std::error::Error>> {
// Validate first
let report = validate_database(path, ValidationLevel::Strict)?;
// Check for errors
if !report.is_valid() {
eprintln!("Database validation failed:");
for error in &report.errors {
eprintln!(" ❌ {}", error);
}
return Err("Validation failed".into());
}
// Show warnings if any
if !report.warnings.is_empty() {
println!("⚠️ Warnings:");
for warning in &report.warnings {
println!(" • {}", warning);
}
}
// Display stats
println!("✓ Validation passed");
println!(" {}", report.stats.summary());
// Safe to open
Ok(Database::open(path)?)
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
let db = load_safe_database(Path::new("database.mxy"))?;
// Use database safely
if let Some(result) = db.lookup("example.com")? {
println!("Found: {:?}", result);
}
Ok(())
}
Validation in Production
Pattern: Validate Once, Use Many Times
#![allow(unused)]
fn main() {
use std::sync::Arc;
use std::collections::HashMap;
use parking_lot::RwLock;
struct DatabaseCache {
databases: Arc<RwLock<HashMap<String, Arc<Database>>>>,
}
impl DatabaseCache {
fn load(&self, path: &str) -> Result<Arc<Database>, Box<dyn std::error::Error>> {
// Check cache first
{
let cache = self.databases.read();
if let Some(db) = cache.get(path) {
return Ok(Arc::clone(db));
}
}
// Validate before loading
let report = validate_database(
Path::new(path),
ValidationLevel::Strict
)?;
if !report.is_valid() {
return Err(format!(
"Database validation failed with {} errors",
report.errors.len()
).into());
}
// Load and cache
let db = Arc::new(Database::open(path)?);
let mut cache = self.databases.write();
cache.insert(path.to_string(), Arc::clone(&db));
Ok(db)
}
}
}
Pattern: Background Validation
#![allow(unused)]
fn main() {
use std::sync::mpsc;
use std::thread;
use std::time::Duration;
fn validate_database_async(
path: String,
) -> Result<mpsc::Receiver<ValidationReport>, Box<dyn std::error::Error>> {
let (tx, rx) = mpsc::channel();
thread::spawn(move || {
let report = validate_database(
Path::new(&path),
ValidationLevel::Standard
);
if let Ok(report) = report {
let _ = tx.send(report);
}
});
Ok(rx)
}
// Usage
let rx = validate_database_async("large.mxy".to_string())?;
// Do other work...
// Check result when ready
if let Ok(report) = rx.recv_timeout(Duration::from_secs(5)) {
if report.is_valid() {
let db = Database::open("large.mxy")?;
}
}
}
Error Handling
Validation errors are separate from database errors:
#![allow(unused)]
fn main() {
use matchy::{MatchyError, validation::ValidationLevel};
match validate_database(path, ValidationLevel::Strict) {
Ok(report) if report.is_valid() => {
// Database is valid
println!("✓ Database validated");
}
Ok(report) => {
// Validation completed but found errors
eprintln!("✗ Database has {} errors", report.errors.len());
for error in &report.errors {
eprintln!(" - {}", error);
}
}
Err(MatchyError::FileNotFound { path }) => {
eprintln!("Database file not found: {}", path);
}
Err(MatchyError::IoError(e)) => {
eprintln!("I/O error during validation: {}", e);
}
Err(e) => {
eprintln!("Validation error: {}", e);
}
}
}
Performance Considerations
Best Practices:
- Validate once per database, not on every open
- Cache validation results for repeated use
- Use Standard level for trusted databases when you need faster validation
- Skip validation for databases you built yourself
- Validate in background for large databases
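One way to follow "validate once" and "cache validation results" is to key the verdict on the file's modification time, revalidating only when the file changes. A sketch; the cache type is ad hoc, not part of Matchy:
#![allow(unused)]
fn main() {
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use matchy::validation::{validate_database, ValidationLevel};
// (mtime, passed) per database path
struct ValidationCache {
    seen: HashMap<PathBuf, (SystemTime, bool)>,
}
impl ValidationCache {
    fn is_valid(&mut self, path: &Path) -> Result<bool, Box<dyn std::error::Error>> {
        let mtime = std::fs::metadata(path)?.modified()?;
        if let Some(&(cached_mtime, passed)) = self.seen.get(path) {
            if cached_mtime == mtime {
                // File unchanged since the last check: reuse the verdict
                return Ok(passed);
            }
        }
        let report = validate_database(path, ValidationLevel::Strict)?;
        let passed = report.is_valid();
        self.seen.insert(path.to_path_buf(), (mtime, passed));
        Ok(passed)
    }
}
}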
Security Best Practices
Always Validate Untrusted Input
#![allow(unused)]
fn main() {
fn load_user_database(user_file: &Path) -> Result<Database, Box<dyn std::error::Error>> {
// ALWAYS validate user-provided files
let report = validate_database(user_file, ValidationLevel::Strict)?;
if !report.is_valid() {
return Err("Untrusted database failed validation".into());
}
Database::open(user_file).map_err(Into::into)
}
}
Limit File Size
#![allow(unused)]
fn main() {
fn validate_with_size_limit(
path: &Path,
max_size: u64,
) -> Result<ValidationReport, Box<dyn std::error::Error>> {
let metadata = std::fs::metadata(path)?;
if metadata.len() > max_size {
return Err(format!(
"Database too large: {} bytes (max: {})",
metadata.len(),
max_size
).into());
}
validate_database(path, ValidationLevel::Strict).map_err(Into::into)
}
}
Use Audit Mode for Security Review
#![allow(unused)]
fn main() {
fn security_audit(path: &Path) -> Result<(), Box<dyn std::error::Error>> {
let report = validate_database(path, ValidationLevel::Audit)?;
println!("Security Audit Report:");
println!(" Valid: {}", report.is_valid());
println!(" Unsafe code locations: {}",
report.stats.unsafe_code_locations.len());
for location in &report.stats.unsafe_code_locations {
println!(" • {} ({:?})",
location.location, location.operation);
println!(" {}", location.justification);
}
println!(" Trust assumptions: {}",
report.stats.trust_assumptions.len());
for assumption in &report.stats.trust_assumptions {
println!(" • {}", assumption.context);
println!(" Bypasses: {}", assumption.bypassed_check);
println!(" Risk: {}", assumption.risk);
}
Ok(())
}
}
See Also
- matchy validate - CLI validation command
- Error Handling - Error types and handling
- Binary Format - What gets validated
- Database Querying - Using validated databases
C API Overview
Matchy provides a stable C API for integration with C, C++, and other languages that support C FFI.
See First Database with C for a tutorial.
Design Principles
The C API follows these principles:
- Opaque handles: All Rust types are wrapped in opaque pointers
- Integer error codes: Functions return int status codes
- No panics: All panics are caught at the FFI boundary
- Memory safety: Clear ownership semantics for all pointers
- ABI stability: Uses #[repr(C)] and extern "C"
Header File
#include <matchy.h>
The header is auto-generated by cbindgen during release builds:
cargo build --release
# Generates include/matchy.h
Core Types
Opaque Handles
typedef struct matchy_database matchy_database;
typedef struct matchy_builder matchy_builder;
typedef struct matchy_result matchy_result;
These are opaque pointers - never dereference them directly.
Error Codes
typedef int matchy_error_t;
#define MATCHY_OK 0
#define MATCHY_ERROR_INVALID_PARAM 1
#define MATCHY_ERROR_FILE_NOT_FOUND 2
#define MATCHY_ERROR_INVALID_FORMAT 3
#define MATCHY_ERROR_CORRUPT_DATA 4
#define MATCHY_ERROR_PATTERN_ERROR 5
#define MATCHY_ERROR_BUILD_FAILED 6
#define MATCHY_ERROR_UNKNOWN 99
Result Types
typedef enum {
MATCHY_RESULT_IP = 1,
MATCHY_RESULT_PATTERN = 2,
MATCHY_RESULT_EXACT_STRING = 3,
} matchy_result_type;
Function Groups
The C API is organized into these groups:
Database Operations
- matchy_open() - Open database (default settings)
- matchy_open_with_options() - Open database with custom options
- matchy_init_open_options() - Initialize option structure
- matchy_close() - Close database
- matchy_query() - Query database (returns by value)
- matchy_query_into() - Query database (writes to pointer, FFI-friendly)
- matchy_get_stats() - Get database statistics
- matchy_clear_cache() - Clear query cache
Builder Operations
- matchy_builder_new() - Create builder
- matchy_builder_add_ip() - Add IP entry
- matchy_builder_add_pattern() - Add pattern entry
- matchy_builder_add_exact() - Add exact string entry
- matchy_builder_build() - Build database
- matchy_builder_free() - Free builder
Result Operations
- matchy_result_type() - Get result type
- matchy_result_ip_prefix_len() - Get IP prefix length
- matchy_result_pattern_count() - Get pattern count
- matchy_result_free() - Free result
Extractor Operations
- matchy_extractor_create() - Create extractor with flags
- matchy_extractor_extract_chunk() - Extract patterns from data
- matchy_extractor_free() - Free extractor
- matchy_matches_free() - Free match results
- matchy_item_type_name() - Get type name string
Error Handling Pattern
All functions return error codes:
matchy_database *db = NULL;
matchy_error_t err = matchy_open("database.mxy", &db);
if (err != MATCHY_OK) {
fprintf(stderr, "Error opening database: %d\n", err);
return 1;
}
// Use db...
matchy_close(db);
Memory Management
Ownership Rules
- Caller owns input strings - You must keep them valid during the call
- Callee owns output handles - Free them with the appropriate _free() function
- Results must be freed - Always call matchy_result_free()
Example
// You own this string
const char *path = "database.mxy";
// Matchy owns this handle after successful open
matchy_database *db = NULL;
if (matchy_open(path, &db) == MATCHY_OK) {
// Use db...
// Matchy owns this result
matchy_result *result = NULL;
if (matchy_lookup(db, "192.0.2.1", &result) == MATCHY_OK) {
if (result != NULL) {
// Use result...
// You must free the result
matchy_result_free(result);
}
}
// You must close the database
matchy_close(db);
}
Thread Safety
- Database handles (matchy_database) are thread-safe for reading
- Builder handles (matchy_builder) are NOT thread-safe
- Result handles (matchy_result) should not be shared
Multiple threads can safely call matchy_lookup() on the same database:
// Thread 1
matchy_result *r1 = NULL;
matchy_lookup(db, "query1", &r1);
// Thread 2 (safe!)
matchy_result *r2 = NULL;
matchy_lookup(db, "query2", &r2);
Opening with Cache Options
Basic Opening (Default Cache)
// Opens with default cache (10,000 entries)
matchy_t *db = matchy_open("database.mxy");
if (db == NULL) {
fprintf(stderr, "Failed to open database\n");
return 1;
}
Custom Cache Configuration
// Initialize options structure
matchy_open_options_t opts;
matchy_init_open_options(&opts);
// Configure cache and validation
opts.cache_capacity = 100000; // Large cache for high repetition
matchy_t *db = matchy_open_with_options("threats.mxy", &opts);
if (db == NULL) {
fprintf(stderr, "Failed to open database\n");
return 1;
}
No Cache
matchy_open_options_t opts;
matchy_init_open_options(&opts);
opts.cache_capacity = 0; // Disable cache
matchy_t *db = matchy_open_with_options("database.mxy", &opts);
Get Statistics
matchy_stats_t stats;
matchy_get_stats(db, &stats);
printf("Total queries: %llu\n", stats.total_queries);
printf("Queries with match: %llu\n", stats.queries_with_match);
printf("IP queries: %llu\n", stats.ip_queries);
printf("String queries: %llu\n", stats.string_queries);
// Calculate rates
double cache_hit_rate = 0.0;
if (stats.cache_hits + stats.cache_misses > 0) {
cache_hit_rate = (double)stats.cache_hits /
(stats.cache_hits + stats.cache_misses);
}
double match_rate = 0.0;
if (stats.total_queries > 0) {
match_rate = (double)stats.queries_with_match / stats.total_queries;
}
printf("Cache hit rate: %.1f%%\n", cache_hit_rate * 100.0);
printf("Match rate: %.1f%%\n", match_rate * 100.0);
matchy_stats_t Structure
typedef struct {
uint64_t total_queries;
uint64_t queries_with_match;
uint64_t queries_without_match;
uint64_t cache_hits;
uint64_t cache_misses;
uint64_t ip_queries;
uint64_t string_queries;
} matchy_stats_t;
Clear Cache
// Do some queries (fills cache)
matchy_result_t result = matchy_query(db, "example.com");
matchy_free_result(&result);
// Clear cache to force fresh lookups
matchy_clear_cache(db);
Complete Example
#include <matchy.h>
#include <stdio.h>
#include <stdlib.h>
int main(void) {
matchy_error_t err;
// Build database
matchy_builder *builder = matchy_builder_new();
if (!builder) {
fprintf(stderr, "Failed to create builder\n");
return 1;
}
err = matchy_builder_add_ip(builder, "192.0.2.1/32", NULL);
if (err != MATCHY_OK) {
fprintf(stderr, "Failed to add IP: %d\n", err);
matchy_builder_free(builder);
return 1;
}
err = matchy_builder_add_pattern(builder, "*.example.com", NULL);
if (err != MATCHY_OK) {
fprintf(stderr, "Failed to add pattern: %d\n", err);
matchy_builder_free(builder);
return 1;
}
// Build to file
err = matchy_builder_build(builder, "database.mxy");
matchy_builder_free(builder);
if (err != MATCHY_OK) {
fprintf(stderr, "Failed to build: %d\n", err);
return 1;
}
// Open database
matchy_database *db = NULL;
err = matchy_open("database.mxy", &db);
if (err != MATCHY_OK) {
fprintf(stderr, "Failed to open: %d\n", err);
return 1;
}
// Query
const char *queries[] = {
"192.0.2.1",
"test.example.com",
"notfound.com",
};
for (int i = 0; i < 3; i++) {
matchy_result *result = NULL;
err = matchy_lookup(db, queries[i], &result);
if (err != MATCHY_OK) {
fprintf(stderr, "Lookup error for '%s': %d\n", queries[i], err);
continue;
}
if (result == NULL) {
printf("%s: Not found\n", queries[i]);
} else {
matchy_result_type type = matchy_result_type(result);
printf("%s: Found (type %d)\n", queries[i], type);
matchy_result_free(result);
}
}
matchy_close(db);
return 0;
}
Compilation
GCC/Clang
gcc -o myapp myapp.c \
-I./include \
-L./target/release \
-lmatchy
Setting Library Path
# Linux
export LD_LIBRARY_PATH=./target/release:$LD_LIBRARY_PATH
# macOS
export DYLD_LIBRARY_PATH=./target/release:$DYLD_LIBRARY_PATH
Static Linking
# For static linking on Linux, you may need system libraries:
gcc -o myapp myapp.c \
-I./include \
./target/release/libmatchy.a \
-lpthread -ldl -lm
# On macOS, static linking usually just needs:
gcc -o myapp myapp.c \
-I./include \
./target/release/libmatchy.a
Best Practices
1. Always Check Return Values
if (matchy_open(path, &db) != MATCHY_OK) {
// Handle error
}
2. Initialize Pointers to NULL
matchy_database *db = NULL; // Good
matchy_open(path, &db);
3. Free Resources in Reverse Order
matchy_result *result = NULL;
matchy_database *db = NULL;
matchy_open("db.mxy", &db);
matchy_lookup(db, "query", &result);
// Free in reverse order
matchy_result_free(result);
matchy_close(db);
4. Use Guards for Cleanup
matchy_database *db = NULL;
matchy_error_t err = matchy_open(path, &db);
if (err != MATCHY_OK) goto cleanup;
// ... use db ...
cleanup:
if (db) matchy_close(db);
return err;
Debugging
Valgrind
Check for memory leaks:
valgrind --leak-check=full --show-leak-kinds=all ./myapp
AddressSanitizer
Compile with sanitizer:
gcc -fsanitize=address -g -o myapp myapp.c -lmatchy
./myapp
Extractor API
The extractor API provides high-performance pattern extraction from text data.
Extraction Flags
Use these flags with matchy_extractor_create() to specify what to extract:
MATCHY_EXTRACT_DOMAINS // Domain names (e.g., "example.com")
MATCHY_EXTRACT_EMAILS // Email addresses
MATCHY_EXTRACT_IPV4 // IPv4 addresses
MATCHY_EXTRACT_IPV6 // IPv6 addresses
MATCHY_EXTRACT_HASHES // File hashes (MD5, SHA1, SHA256, SHA384, SHA512)
MATCHY_EXTRACT_BITCOIN // Bitcoin addresses
MATCHY_EXTRACT_ETHEREUM // Ethereum addresses
MATCHY_EXTRACT_MONERO // Monero addresses
MATCHY_EXTRACT_ALL // All of the above
Item Types
Match results include an item type:
MATCHY_ITEM_TYPE_DOMAIN // Domain name
MATCHY_ITEM_TYPE_EMAIL // Email address
MATCHY_ITEM_TYPE_IPV4 // IPv4 address
MATCHY_ITEM_TYPE_IPV6 // IPv6 address
MATCHY_ITEM_TYPE_MD5 // MD5 hash
MATCHY_ITEM_TYPE_SHA1 // SHA1 hash
MATCHY_ITEM_TYPE_SHA256 // SHA256 hash
MATCHY_ITEM_TYPE_SHA384 // SHA384 hash
MATCHY_ITEM_TYPE_SHA512 // SHA512 hash
MATCHY_ITEM_TYPE_BITCOIN // Bitcoin address
MATCHY_ITEM_TYPE_ETHEREUM // Ethereum address
MATCHY_ITEM_TYPE_MONERO // Monero address
Functions
- matchy_extractor_create(flags) - Create extractor with specified flags
- matchy_extractor_extract_chunk(extractor, data, len, matches) - Extract patterns
- matchy_extractor_free(extractor) - Free extractor
- matchy_matches_free(matches) - Free match results
- matchy_item_type_name(type) - Get string name for item type
Example
#include <matchy.h>
#include <stdio.h>
#include <string.h>
int main() {
// Create extractor for domains and IPs only
matchy_extractor_t *ext = matchy_extractor_create(
MATCHY_EXTRACT_DOMAINS | MATCHY_EXTRACT_IPV4 | MATCHY_EXTRACT_IPV6
);
if (!ext) {
fprintf(stderr, "Failed to create extractor\n");
return 1;
}
// Extract from text
const char *text = "Check evil.com and 192.168.1.1";
matchy_matches_t matches;
int err = matchy_extractor_extract_chunk(
ext,
(const uint8_t *)text,
strlen(text),
&matches
);
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Extraction failed: %d\n", err);
matchy_extractor_free(ext);
return 1;
}
// Process results
for (size_t i = 0; i < matches.count; i++) {
printf("%s: %s (bytes %zu-%zu)\n",
matchy_item_type_name(matches.items[i].item_type),
matches.items[i].value,
matches.items[i].start,
matches.items[i].end);
}
// Cleanup
matchy_matches_free(&matches);
matchy_extractor_free(ext);
return 0;
}
Output:
Domain: evil.com (bytes 6-14)
IPv4: 192.168.1.1 (bytes 19-30)
Match Structure
typedef struct matchy_match_t {
uint8_t item_type; // MATCHY_ITEM_TYPE_* constant
const char *value; // Extracted value (null-terminated)
size_t start; // Start byte offset in input
size_t end; // End byte offset (exclusive)
} matchy_match_t;
typedef struct matchy_matches_t {
const matchy_match_t *items; // Array of matches
size_t count; // Number of matches
} matchy_matches_t;
Thread Safety
- Extractor handles (matchy_extractor_t*) are thread-safe for concurrent extraction
- Multiple threads can safely call matchy_extractor_extract_chunk() on the same extractor
- Each thread should have its own matchy_matches_t for results
See Also
- C Database Functions - Database operations
- C Builder Functions - Builder operations
- C Result Functions - Result operations
- First C Tutorial
Building Databases from C
This page documents the C API functions for building Matchy databases.
Overview
Building a database in C involves three steps:
- Create a builder with
matchy_builder_new() - Add entries with
matchy_builder_add_*()functions - Build and save with
matchy_builder_build()
#include <matchy.h>
matchy_builder_t *builder = matchy_builder_new();
matchy_builder_add_ip(builder, "192.0.2.1/32", NULL);
matchy_builder_add_pattern(builder, "*.example.com", NULL);
matchy_error_t err = matchy_builder_build(builder, "database.mxy");
matchy_builder_free(builder);
Builder Functions
matchy_builder_new
matchy_builder_t *matchy_builder_new(void);
Creates a new database builder.
Returns: Builder handle, or NULL on error
Example:
matchy_builder_t *builder = matchy_builder_new();
if (!builder) {
fprintf(stderr, "Failed to create builder\n");
return 1;
}
Memory: Caller must free with matchy_builder_free()
matchy_builder_free
void matchy_builder_free(matchy_builder_t *builder);
Frees a builder and all its resources.
Parameters:
- builder - Builder to free (may be NULL)
Example:
matchy_builder_free(builder);
builder = NULL; // Good practice
Note: After calling this, the builder handle must not be used.
Adding Entries
matchy_builder_add_ip
matchy_error_t matchy_builder_add_ip(
matchy_builder_t *builder,
const char *ip_cidr,
const char *data_json
);
Adds an IP address or CIDR range to the database.
Parameters:
- builder - Builder handle
- ip_cidr - IP address or CIDR (e.g., "192.0.2.1" or "10.0.0.0/8")
- data_json - Associated data as JSON string, or NULL
Returns: MATCHY_SUCCESS or error code
Example:
// IP without data
err = matchy_builder_add_ip(builder, "8.8.8.8", NULL);
// IP with data
err = matchy_builder_add_ip(builder, "192.0.2.1/32",
"{\"country\":\"US\",\"asn\":15169}");
// CIDR range
err = matchy_builder_add_ip(builder, "10.0.0.0/8",
"{\"type\":\"private\"}");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add IP\n");
}
Valid formats:
- IPv4: "192.0.2.1", "10.0.0.0/8"
- IPv6: "2001:db8::1", "2001:db8::/32"
matchy_builder_add_pattern
matchy_error_t matchy_builder_add_pattern(
matchy_builder_t *builder,
const char *pattern,
const char *data_json
);
Adds a glob pattern to the database.
Parameters:
- builder - Builder handle
- pattern - Glob pattern string
- data_json - Associated data as JSON, or NULL
Returns: MATCHY_SUCCESS or error code
Example:
// Simple wildcard
err = matchy_builder_add_pattern(builder, "*.google.com", NULL);
// With data
err = matchy_builder_add_pattern(builder, "mail.*",
"{\"category\":\"email\",\"priority\":10}");
// Character class
err = matchy_builder_add_pattern(builder, "test[123].com", NULL);
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Invalid pattern\n");
}
Pattern syntax:
- * - Matches any characters
- ? - Matches a single character
- [abc] - Matches any of a, b, c
- [!abc] - Matches anything except a, b, c
matchy_builder_add_exact
matchy_error_t matchy_builder_add_exact(
matchy_builder_t *builder,
const char *string,
const char *data_json
);
Adds an exact string match to the database.
Parameters:
- builder - Builder handle
- string - Exact string to match
- data_json - Associated data as JSON, or NULL
Returns: MATCHY_SUCCESS or error code
Example:
// Exact match
err = matchy_builder_add_exact(builder, "example.com", NULL);
// With data
err = matchy_builder_add_exact(builder, "api.example.com",
"{\"endpoint\":\"api\",\"rate_limit\":1000}");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add string\n");
}
Note: Exact matches are faster than patterns. Use them when possible.
Building the Database
matchy_builder_build
matchy_error_t matchy_builder_build(
matchy_builder_t *builder,
const char *output_path
);
Builds the database and writes it to a file.
Parameters:
- builder - Builder handle
- output_path - Path where database file will be written
Returns: MATCHY_SUCCESS or error code
Example:
err = matchy_builder_build(builder, "database.mxy");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Build failed\n");
return 1;
}
printf("Database written to database.mxy\n");
Notes:
- File is created or overwritten
- Build process compiles all entries into optimized format
- Builder can be reused after building
Complete Example
#include <matchy.h>
#include <stdio.h>
#include <stdlib.h>
int main(void) {
matchy_error_t err;
// Create builder
matchy_builder_t *builder = matchy_builder_new();
if (!builder) {
fprintf(stderr, "Failed to create builder\n");
return 1;
}
// Add IP entries
err = matchy_builder_add_ip(builder, "192.0.2.1/32",
"{\"country\":\"US\"}");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add IP\n");
goto cleanup;
}
err = matchy_builder_add_ip(builder, "10.0.0.0/8",
"{\"type\":\"private\"}");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add CIDR\n");
goto cleanup;
}
// Add patterns
err = matchy_builder_add_pattern(builder, "*.google.com",
"{\"category\":\"search\"}");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add pattern\n");
goto cleanup;
}
err = matchy_builder_add_pattern(builder, "mail.*",
"{\"category\":\"email\"}");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add pattern\n");
goto cleanup;
}
// Add exact strings
err = matchy_builder_add_exact(builder, "example.com", NULL);
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add exact string\n");
goto cleanup;
}
// Build database
err = matchy_builder_build(builder, "my_database.mxy");
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Build failed\n");
goto cleanup;
}
printf("✓ Database built successfully: my_database.mxy\n");
cleanup:
matchy_builder_free(builder);
return (err == MATCHY_SUCCESS) ? 0 : 1;
}
Compilation:
gcc -o build_db build_db.c -lmatchy
./build_db
Data Format
JSON Data Structure
Data is passed as JSON strings:
{
"key1": "string_value",
"key2": 42,
"key3": 3.14,
"key4": true,
"key5": ["array", "values"],
"key6": {
"nested": "object"
}
}
Supported types:
- Strings
- Numbers (integers, floats)
- Booleans (true / false)
- Arrays
- Objects (nested maps)
- null
Example with Complex Data
const char *geo_data =
"{"
" \"country\": \"US\","
" \"city\": \"Mountain View\","
" \"coords\": {"
" \"lat\": 37.386,"
" \"lon\": -122.084"
" },"
" \"tags\": [\"datacenter\", \"cloud\"]"
"}";
matchy_builder_add_ip(builder, "8.8.8.8", geo_data);
Error Handling
Error Codes
| Code | Constant | Meaning |
|---|---|---|
| 0 | MATCHY_SUCCESS | Operation succeeded |
| -1 | MATCHY_ERROR_FILE_NOT_FOUND | File not found |
| -2 | MATCHY_ERROR_INVALID_FORMAT | Invalid format |
| -3 | MATCHY_ERROR_CORRUPT_DATA | Data corruption |
| -4 | MATCHY_ERROR_OUT_OF_MEMORY | Out of memory |
| -5 | MATCHY_ERROR_INVALID_PARAM | Invalid parameter |
| -6 | MATCHY_ERROR_IO | I/O error |
Checking Errors
err = matchy_builder_add_ip(builder, ip, data);
if (err != MATCHY_SUCCESS) {
switch (err) {
case MATCHY_ERROR_INVALID_PARAM:
fprintf(stderr, "Invalid IP address: %s\n", ip);
break;
case MATCHY_ERROR_OUT_OF_MEMORY:
fprintf(stderr, "Out of memory\n");
break;
default:
fprintf(stderr, "Error: %d\n", err);
}
}
Best Practices
1. Always Check Returns
if (matchy_builder_add_ip(builder, ip, data) != MATCHY_SUCCESS) {
// Handle error
}
2. Use Cleanup Labels
matchy_builder_t *builder = NULL;
matchy_error_t err;
builder = matchy_builder_new();
if (!builder) goto cleanup;
err = matchy_builder_add_ip(builder, "192.0.2.1", NULL);
if (err != MATCHY_SUCCESS) goto cleanup;
// ... more operations ...
cleanup:
if (builder) matchy_builder_free(builder);
return err;
3. Validate Input
if (!ip || strlen(ip) == 0) {
fprintf(stderr, "Empty IP address\n");
return MATCHY_ERROR_INVALID_PARAM;
}
err = matchy_builder_add_ip(builder, ip, data);
4. Batch Operations
const char *ips[] = {
"192.0.2.1",
"10.0.0.1",
"172.16.0.1",
NULL
};
for (int i = 0; ips[i]; i++) {
err = matchy_builder_add_ip(builder, ips[i], NULL);
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Failed to add IP %s\n", ips[i]);
// Continue or abort based on requirements
}
}
Thread Safety
Builders are NOT thread-safe. Do not call builder functions from multiple threads simultaneously.
// WRONG: Don't do this
#pragma omp parallel for
for (int i = 0; i < n; i++) {
matchy_builder_add_ip(builder, ips[i], NULL); // Data race!
}
// RIGHT: Use a single thread for building
for (int i = 0; i < n; i++) {
matchy_builder_add_ip(builder, ips[i], NULL);
}
Performance Tips
1. Pre-allocate When Possible
If you know approximately how many entries you’ll add, building is more efficient.
2. Order Doesn’t Matter
Entries can be added in any order - the builder optimizes internally.
3. Reuse Builders
Builders can be reused after building:
matchy_builder_build(builder, "db1.mxy");
// Builder is still valid, can add more entries
matchy_builder_add_ip(builder, "1.2.3.4", NULL);
matchy_builder_build(builder, "db2.mxy");
4. Build Time
Building time depends on entry count:
- 1,000 entries: ~10ms
- 10,000 entries: ~50ms
- 100,000 entries: ~500ms
- 1,000,000 entries: ~5s
See Also
- C API Overview - C API introduction
- Querying from C - Query databases
- Memory Management - Memory handling
- First Database with C - Tutorial
C Querying
Query operations and result handling in the Matchy C API.
Overview
The C API provides functions to open databases and perform queries against IPs, strings, and patterns. All query functions are thread-safe for concurrent reads.
Opening Databases
Open from File
matchy_t *matchy_open(const char *filename);
Opens a database file with validation:
- Memory-maps the file
- Validates MMDB structure
- Checks PARAGLOB section
- Validates all UTF-8 strings
Returns NULL on error.
Example:
matchy_t *db = matchy_open("database.mxy");
if (!db) {
fprintf(stderr, "Failed to open database\n");
return 1;
}
// Use db...
matchy_close(db);
Open from Buffer
matchy_t *matchy_open_buffer(const uint8_t *buffer, uintptr_t size);
Opens a database from memory:
- Buffer must remain valid for database lifetime
- No file I/O required
- Useful for embedded databases
Example:
uint8_t *buffer = load_database_somehow();
uintptr_t size = get_database_size();
matchy_t *db = matchy_open_buffer(buffer, size);
if (!db) {
free(buffer);
return 1;
}
// Query db...
matchy_close(db);
free(buffer); // Safe to free after close
Query Operations
Unified Lookup
int32_t matchy_lookup(matchy_t *db,
const char *text,
matchy_result_t **result);
Queries the database with automatic type detection:
- IP address: Parses as IPv4 or IPv6
- Domain/string: Searches patterns and exact strings
- Other text: Pattern matching only
Returns:
- MATCHY_SUCCESS (0) on success
- Error code on failure
- *result set to NULL if no match
Example:
matchy_result_t *result = NULL;
int32_t err = matchy_lookup(db, "192.0.2.1", &result);
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Query error: %d\n", err);
return 1;
}
if (result != NULL) {
printf("Match found!\n");
matchy_free_result(result);
} else {
printf("No match\n");
}
IP Lookup
int32_t matchy_lookup_ip(matchy_t *db,
struct sockaddr *addr,
matchy_result_t **result);
Direct IP lookup using sockaddr:
- Supports IPv4 (sockaddr_in)
- Supports IPv6 (sockaddr_in6)
- Faster than parsing text
Example:
struct sockaddr_in addr = {0};
addr.sin_family = AF_INET;
addr.sin_addr.s_addr = inet_addr("192.0.2.1");
matchy_result_t *result = NULL;
int32_t err = matchy_lookup_ip(db, (struct sockaddr *)&addr, &result);
if (err == MATCHY_SUCCESS && result) {
// Process result...
matchy_free_result(result);
}
String Lookup
int32_t matchy_lookup_string(matchy_t *db,
const char *text,
matchy_result_t **result);
Pattern and exact string matching:
- Searches glob patterns
- Searches exact string table
- Returns first match
Example:
matchy_result_t *result = NULL;
int32_t err = matchy_lookup_string(db, "test.example.com", &result);
if (err == MATCHY_SUCCESS && result) {
printf("Matched pattern or exact string\n");
matchy_free_result(result);
}
Result Handling
Get Result Type
uint32_t matchy_result_type(const matchy_result_t *result);
Returns the match type:
- MATCHY_RESULT_IP (1) - IP address match
- MATCHY_RESULT_PATTERN (2) - Pattern match
- MATCHY_RESULT_EXACT_STRING (3) - Exact string match
Example:
uint32_t type = matchy_result_type(result);
switch (type) {
case MATCHY_RESULT_IP:
printf("IP match\n");
break;
case MATCHY_RESULT_PATTERN:
printf("Pattern match\n");
break;
case MATCHY_RESULT_EXACT_STRING:
printf("Exact string match\n");
break;
}
Get Entry Data
int32_t matchy_result_get_entry(const matchy_result_t *result,
matchy_entry_s *entry);
Extracts structured data from the result:
Example:
matchy_entry_s entry = {0};
if (matchy_result_get_entry(result, &entry) == MATCHY_SUCCESS) {
// Entry contains structured data
// See Data Types Reference for details
}
Extract Entry Data
int32_t matchy_aget_value(const matchy_entry_s *entry,
matchy_entry_data_t *data,
const char *const *path);
Navigates structured data:
Example:
matchy_entry_s entry = {0};
matchy_result_get_entry(result, &entry);
const char *path[] = {"metadata", "country", NULL};
matchy_entry_data_t data = {0};
if (matchy_aget_value(&entry, &data, path) == MATCHY_SUCCESS) {
if (data.type == MATCHY_DATA_TYPE_UTF8_STRING) {
printf("Country: %s\n", data.value.utf8_string);
}
}
Complete Examples
Single Query
#include <matchy/matchy.h>
#include <stdio.h>
int main(void) {
// Open database
matchy_t *db = matchy_open("database.mxy");
if (!db) {
fprintf(stderr, "Failed to open database\n");
return 1;
}
// Query
matchy_result_t *result = NULL;
int32_t err = matchy_lookup(db, "192.0.2.1", &result);
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Query failed: %d\n", err);
matchy_close(db);
return 1;
}
if (result) {
printf("Match found!\n");
matchy_free_result(result);
} else {
printf("No match\n");
}
matchy_close(db);
return 0;
}
Batch Queries
void batch_query(matchy_t *db, const char **queries, size_t count) {
for (size_t i = 0; i < count; i++) {
matchy_result_t *result = NULL;
if (matchy_lookup(db, queries[i], &result) == MATCHY_SUCCESS) {
if (result) {
printf("%s: MATCH\n", queries[i]);
matchy_free_result(result);
} else {
printf("%s: no match\n", queries[i]);
}
}
}
}
Multi-threaded Queries
#include <pthread.h>
struct query_args {
matchy_t *db;
const char *query;
};
void *query_thread(void *arg) {
struct query_args *args = arg;
matchy_result_t *result = NULL;
if (matchy_lookup(args->db, args->query, &result) == MATCHY_SUCCESS) {
if (result) {
printf("[%ld] Match: %s\n",
(long)pthread_self(), args->query);
matchy_free_result(result);
}
}
return NULL;
}
int main(void) {
matchy_t *db = matchy_open("database.mxy");
if (!db) return 1;
pthread_t threads[4];
struct query_args args[4] = {
{db, "192.0.2.1"},
{db, "10.0.0.1"},
{db, "example.com"},
{db, "*.test.com"}
};
// Spawn threads (safe: db is thread-safe for reads)
for (int i = 0; i < 4; i++) {
pthread_create(&threads[i], NULL, query_thread, &args[i]);
}
// Wait for completion
for (int i = 0; i < 4; i++) {
pthread_join(threads[i], NULL);
}
matchy_close(db);
return 0;
}
Performance Tips
1. Reuse Database Handle
❌ Slow:
for (int i = 0; i < 1000; i++) {
matchy_t *db = matchy_open("database.mxy");
matchy_lookup(db, queries[i], &result);
matchy_close(db);
}
✅ Fast:
matchy_t *db = matchy_open("database.mxy");
for (int i = 0; i < 1000; i++) {
    matchy_result_t *result = NULL;
    matchy_lookup(db, queries[i], &result);
    if (result) matchy_free_result(result);
}
matchy_close(db);
2. Free Results Promptly
matchy_result_t *result = NULL;
matchy_lookup(db, query, &result);
if (result) {
// Extract what you need
uint32_t type = matchy_result_type(result);
// Free immediately
matchy_free_result(result);
}
3. Use Direct IP Lookup
❌ Slower:
matchy_lookup(db, "192.0.2.1", &result); // Parses string
✅ Faster:
struct sockaddr_in addr = /* ... */;
matchy_lookup_ip(db, (struct sockaddr *)&addr, &result); // Direct
Error Handling
Check All Return Codes
matchy_t *db = matchy_open(filename);
if (!db) {
fprintf(stderr, "Open failed\n");
return 1;
}
matchy_result_t *result = NULL;
int32_t err = matchy_lookup(db, query, &result);
if (err != MATCHY_SUCCESS) {
fprintf(stderr, "Lookup failed: %d\n", err);
matchy_close(db);
return 1;
}
// Check for no match
if (!result) {
printf("No match found\n");
}
matchy_close(db);
Common Error Codes
- MATCHY_SUCCESS (0) - Success
- MATCHY_ERROR_INVALID_PARAM (-5) - NULL parameter
- MATCHY_ERROR_FILE_NOT_FOUND (-1) - File doesn’t exist
- MATCHY_ERROR_INVALID_FORMAT (-2) - Corrupt database
- MATCHY_ERROR_CORRUPT_DATA (-3) - Data integrity error
Thread Safety
Safe: Concurrent Queries
// Thread 1
matchy_lookup(db, "query1", &r1);
// Thread 2 (safe!)
matchy_lookup(db, "query2", &r2);
Unsafe: Query During Close
// Thread 1: Querying
matchy_lookup(db, query, &result);
// Thread 2: Closing (RACE CONDITION!)
matchy_close(db);
Pattern: Thread-Safe Queries
// Main thread
matchy_t *db = matchy_open("database.mxy");
// Spawn worker threads
// ... all threads use db safely ...
// Wait for all threads to finish
// ... join threads ...
// Only then close
matchy_close(db);
See Also
- C Memory Management - Cleanup and lifetimes
- C API Overview - API design
- Data Types Reference - Structured data handling
- Error Handling Reference - Error codes
C Memory Management
Comprehensive guide to memory management in the Matchy C API.
Overview
The Matchy C API uses opaque handles to manage Rust objects safely from C. Understanding the ownership and lifetime rules is critical for preventing memory leaks and use-after-free bugs.
Core Principles
1. Ownership Model
- Caller owns input strings - Keep them valid for the duration of the function call
- Callee owns output handles - The library manages the underlying memory
- Explicit cleanup required - Always call the matching _free() or _close() function
2. No Double-Free
Once you call a cleanup function, the handle is invalid:
matchy_builder_free(builder);
builder = NULL; // Good practice: prevent use-after-free
3. Memory Lifetime
Handles remain valid until explicitly freed, even if the creating function returns.
Cleanup Functions
Database Handles
void matchy_close(matchy_t *db);
Closes a database and frees associated resources:
- Unmaps the database file
- Releases internal buffers
- Invalidates the handle
When to call: After you’re done querying the database
matchy_t *db = NULL;
if (matchy_open("database.mxy", &db) == MATCHY_SUCCESS) {
// Use db for queries...
matchy_close(db);
db = NULL; // Good practice
}
Builder Handles
void matchy_builder_free(matchy_builder_t *builder);
Frees a builder and all associated entries:
- Releases all added entries
- Frees internal build state
- Invalidates the handle
When to call: After building or if build fails
matchy_builder_t *builder = matchy_builder_new();
if (builder) {
matchy_builder_add(builder, "key", NULL);
// ... build or error ...
matchy_builder_free(builder);
builder = NULL;
}
Result Handles
void matchy_free_result(matchy_result_t *result);
Query results use a zero-allocation design - data is decoded on-demand from memory-mapped storage. This function is a no-op but should still be called for API consistency and forward compatibility.
When to call: After you’re done with the result (for code clarity and future compatibility)
matchy_result_t *result = NULL;
int32_t err = matchy_lookup(db, "192.0.2.1", &result);
if (err == MATCHY_SUCCESS && result != NULL) {
// Extract data from result...
matchy_free_result(result);
result = NULL;
}
String Handles
void matchy_free_string(char *string);
Frees strings allocated by the library (e.g., error messages, validation results):
When to call: After using library-allocated strings
char *error_msg = NULL;
int32_t err = matchy_validate("file.mxy", MATCHY_VALIDATION_STRICT, &error_msg);
if (err != MATCHY_SUCCESS && error_msg != NULL) {
fprintf(stderr, "Validation error: %s\n", error_msg);
matchy_free_string(error_msg);
}
Entry Data Lists
void matchy_free_entry_data_list(matchy_entry_data_list_t *list);
Frees structured data query results:
When to call: After processing entry data
matchy_entry_data_list_t *list = NULL;
if (matchy_get_entry_data_list(entry, &list) == MATCHY_SUCCESS) {
// Process list...
matchy_free_entry_data_list(list);
}
Common Patterns
Pattern 1: Single Query
void query_once(const char *db_path, const char *query) {
matchy_t *db = NULL;
// Open database
if (matchy_open(db_path, &db) != MATCHY_SUCCESS) {
return;
}
// Query
matchy_result_t *result = NULL;
if (matchy_lookup(db, query, &result) == MATCHY_SUCCESS) {
if (result != NULL) {
// Use result...
matchy_free_result(result);
}
}
// Cleanup
matchy_close(db);
}
Pattern 2: Multiple Queries
void query_many(const char *db_path, const char **queries, size_t count) {
matchy_t *db = NULL;
if (matchy_open(db_path, &db) != MATCHY_SUCCESS) {
return;
}
// Reuse database handle for multiple queries
for (size_t i = 0; i < count; i++) {
matchy_result_t *result = NULL;
if (matchy_lookup(db, queries[i], &result) == MATCHY_SUCCESS) {
if (result != NULL) {
// Use result...
matchy_free_result(result);
}
}
}
matchy_close(db);
}
Pattern 3: Build and Query
int build_and_query(void) {
matchy_builder_t *builder = matchy_builder_new();
if (!builder) {
return -1;
}
// Build
matchy_builder_add(builder, "key", "{\"value\": 42}");
uint8_t *buffer = NULL;
uintptr_t size = 0;
int32_t err = matchy_builder_build(builder, &buffer, &size);
// Builder no longer needed
matchy_builder_free(builder);
if (err != MATCHY_SUCCESS) {
return -1;
}
// Open from buffer
matchy_t *db = NULL;
err = matchy_open_buffer(buffer, size, &db);
if (err != MATCHY_SUCCESS) {
free(buffer);
return -1;
}
// Query
matchy_result_t *result = NULL;
matchy_lookup(db, "key", &result);
if (result) {
matchy_free_result(result);
}
matchy_close(db);
free(buffer);
return 0;
}
Error Handling
Early Returns
Always cleanup on error paths:
matchy_t *db = NULL;
if (matchy_open(path, &db) != MATCHY_SUCCESS) {
return -1; // Nothing to cleanup
}
matchy_result_t *result = NULL;
if (matchy_lookup(db, query, &result) != MATCHY_SUCCESS) {
matchy_close(db); // Must cleanup db!
return -1;
}
// Use result...
matchy_free_result(result);
matchy_close(db);
return 0;
Goto Cleanup Pattern
For complex functions:
int process(const char *path) {
matchy_t *db = NULL;
matchy_result_t *result = NULL;
int ret = -1;
if (matchy_open(path, &db) != MATCHY_SUCCESS) {
goto cleanup;
}
if (matchy_lookup(db, "query", &result) != MATCHY_SUCCESS) {
goto cleanup;
}
// Success path
ret = 0;
cleanup:
if (result) matchy_free_result(result);
if (db) matchy_close(db);
return ret;
}
Thread Safety
Database Handles
Thread-safe for concurrent reads:
// Thread 1
matchy_result_t *r1 = NULL;
matchy_lookup(db, "query1", &r1); // Safe
matchy_free_result(r1);
// Thread 2 (concurrent, safe)
matchy_result_t *r2 = NULL;
matchy_lookup(db, "query2", &r2); // Safe
matchy_free_result(r2);
Not safe for concurrent close:
// Thread 1: Querying
matchy_lookup(db, "query", &result);
// Thread 2: Closing (UNSAFE!)
matchy_close(db); // Race condition!
Builder Handles
Not thread-safe - use from a single thread:
// UNSAFE:
matchy_builder_t *builder = matchy_builder_new();
// Thread 1
matchy_builder_add(builder, "key1", NULL);
// Thread 2
matchy_builder_add(builder, "key2", NULL); // Race condition!
Result Handles
Not thread-safe - each thread needs its own:
// Safe: Each thread has its own result
void *thread1(void *arg) {
matchy_t *db = arg;
matchy_result_t *result = NULL;
matchy_lookup(db, "query1", &result);
matchy_free_result(result);
return NULL;
}
void *thread2(void *arg) {
matchy_t *db = arg;
matchy_result_t *result = NULL;
matchy_lookup(db, "query2", &result);
matchy_free_result(result);
return NULL;
}
Common Mistakes
Mistake 1: Forgetting to Free
❌ Wrong:
matchy_t *db = NULL;
matchy_open("database.mxy", &db);
// ... use db ...
// Forgot to close! Memory/file handle leak
✅ Correct:
matchy_t *db = NULL;
matchy_open("database.mxy", &db);
// ... use db ...
matchy_close(db);
db = NULL;
Note: Query results (matchy_result_t) use a zero-allocation design, so forgetting matchy_free_result() won’t leak memory. However, calling it is still recommended for code clarity and forward compatibility.
Mistake 2: Use After Free
❌ Wrong:
matchy_result_t *result = NULL;
matchy_lookup(db, "query", &result);
matchy_free_result(result);
// Use after free!
int type = matchy_result_type(result);
✅ Correct:
matchy_result_t *result = NULL;
matchy_lookup(db, "query", &result);
int type = matchy_result_type(result);
matchy_free_result(result);
result = NULL; // Good practice
Mistake 3: Double Free
For most handles, double-free causes undefined behavior:
❌ Wrong:
matchy_close(db);
matchy_close(db); // Double free! Undefined behavior
✅ Correct:
if (db) {
matchy_close(db);
db = NULL;
}
Note: matchy_free_result() is safe to call multiple times (it’s a no-op), but setting pointers to NULL after cleanup is still good practice for consistency.
Mistake 4: Missing Cleanup on Error
❌ Wrong:
matchy_t *db = NULL;
matchy_open(path, &db);
matchy_result_t *result = NULL;
if (matchy_lookup(db, query, &result) != MATCHY_SUCCESS) {
return -1; // Leak! Didn't close db
}
✅ Correct:
matchy_t *db = NULL;
matchy_open(path, &db);
matchy_result_t *result = NULL;
if (matchy_lookup(db, query, &result) != MATCHY_SUCCESS) {
matchy_close(db);
return -1;
}
Valgrind Testing
Use Valgrind to detect memory issues:
valgrind --leak-check=full \
--show-leak-kinds=all \
--track-origins=yes \
./your_program
A clean run should show:
HEAP SUMMARY:
in use at exit: 0 bytes in 0 blocks
total heap usage: X allocs, X frees, Y bytes allocated
All heap blocks were freed -- no leaks are possible
See Also
- C API Overview - API design and principles
- C Querying - Query operations
- Building with C - Compilation and linking
- Error Handling Reference - Error codes and handling
Binary Format Specification
Detailed binary format specification for Matchy databases.
Matchy databases use the MaxMind DB (MMDB) format with optional extensions for string and pattern matching.
Overview
The format has three main components:
- MMDB Section: Standard MaxMind DB format for IP address lookups
- PARAGLOB Section: Optional extension for glob pattern matching
- String Literals Hash Section: Optional extension for exact string matching
All components coexist in a single .mxy file.
File Structure
Note: The MMDB format is unusual - it has no header or magic bytes at the start. The file begins directly with the IP search tree, and all metadata is stored at the end of the file.
┌─────────────────────────────────────────────────────────┐
│ IP Search Tree (Binary Trie) │ Starts at byte 0
├─────────────────────────────────────────────────────────┤
│ 16-byte separator │
├─────────────────────────────────────────────────────────┤
│ Data Section (Shared) │ MMDB data values
├─────────────────────────────────────────────────────────┤
│ MMDB_PATTERN separator (optional) │ "MMDB_PATTERN\x00\x00\x00\x00"
├─────────────────────────────────────────────────────────┤
│ PARAGLOB SECTION (optional) │ Glob pattern matching
├─────────────────────────────────────────────────────────┤
│ MMDB_LITERAL separator (optional) │ "MMDB_LITERAL\x00\x00\x00\x00"
├─────────────────────────────────────────────────────────┤
│ STRING LITERALS HASH SECTION (optional) │ O(1) exact string lookups
├─────────────────────────────────────────────────────────┤
│ Metadata Marker │ "\xAB\xCD\xEFMaxMind.com"
├─────────────────────────────────────────────────────────┤
│ MMDB Metadata (within last 128KB) │ node_count, record_size, etc.
└─────────────────────────────────────────────────────────┘
Section Descriptions
IP Search Tree: Binary trie for IP address lookups. This is the first data in the file (offset 0). The tree structure depends on metadata fields that are only available after parsing the metadata at the end of the file.
Data Section: Shared MMDB-encoded data values referenced by all query types (IP, pattern, and literal lookups).
PARAGLOB Section: Optional section for glob pattern matching. Only present if the database contains patterns with wildcards (e.g., *.example.com).
String Literals Hash Section: Optional hash table for O(1) exact string matching. Only present if the database contains literal strings (non-wildcard patterns).
MMDB Metadata: Contains essential database information:
- node_count: Number of nodes in the IP search tree
- record_size: Size of tree records (24, 28, or 32 bits)
- ip_version: IPv4 (4) or IPv6 (6)
- pattern_section_offset: Offset to PARAGLOB section (0 if absent)
- literal_section_offset: Offset to literal hash section (0 if absent)
- Build timestamp, database type, description, etc.
The metadata marker (\xAB\xCD\xEFMaxMind.com) is located within the last 128KB of the file. Parsers search backwards from the end to find it.
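A minimal sketch of that backwards search, using std only; the constant and function names here are illustrative rather than taken from the Matchy source:
const METADATA_MARKER: &[u8] = b"\xAB\xCD\xEFMaxMind.com";
const SEARCH_WINDOW: usize = 128 * 1024;

fn find_metadata_start(file: &[u8]) -> Option<usize> {
    // Only the last 128 KB needs to be scanned.
    let window_start = file.len().saturating_sub(SEARCH_WINDOW);
    let window = &file[window_start..];
    window
        .windows(METADATA_MARKER.len())
        .rposition(|w| w == METADATA_MARKER)               // the last occurrence wins
        .map(|i| window_start + i + METADATA_MARKER.len()) // metadata starts right after the marker
}

fn main() {
    let file = [b"tree-and-data".as_ref(), METADATA_MARKER, b"{metadata}".as_ref()].concat();
    assert_eq!(find_metadata_start(&file), Some(file.len() - "{metadata}".len()));
}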
MMDB Section
The file follows the standard MaxMind DB format:
- See MaxMind DB Spec
Key characteristics:
- No header at start of file
- File begins with IP search tree data at offset 0
- Metadata stored at end of file for fast tail access
- Memory-mappable with zero-copy access
Metadata
Standard MMDB metadata map at the end of the file (after metadata marker):
{
"binary_format_major_version": 2,
"binary_format_minor_version": 0,
"build_epoch": 1234567890,
"database_type": "Matchy",
"description": {
"en": "Matchy unified database"
},
"ip_version": 6,
"node_count": 12345,
"record_size": 28
}
Search Tree
Binary trie for IP address lookups:
- Node size: 7 bytes (28-bit pointers × 2)
- Record size: 28 bits per record
- Addressing: Supports up to 256M nodes
Each node contains two 28-bit pointers (left/right):
Node (7 bytes):
├─ Left pointer (28 bits) → next node or data
└─ Right pointer (28 bits) → next node or data
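Following the MaxMind DB spec’s 28-bit record layout (the upper nibble of the middle byte belongs to the left record, the lower nibble to the right), a node can be decoded as in this minimal sketch; the function name is illustrative:
fn decode_node_28(node: &[u8; 7]) -> (u32, u32) {
    // Left record: bits 27..24 from the upper nibble of byte 3, bits 23..0 from bytes 0..2.
    let left = (u32::from(node[3] & 0xF0) << 20)
        | (u32::from(node[0]) << 16)
        | (u32::from(node[1]) << 8)
        | u32::from(node[2]);
    // Right record: bits 27..24 from the lower nibble of byte 3, bits 23..0 from bytes 4..6.
    let right = (u32::from(node[3] & 0x0F) << 24)
        | (u32::from(node[4]) << 16)
        | (u32::from(node[5]) << 8)
        | u32::from(node[6]);
    // Per the MMDB spec, each record is a node index, node_count (no data), or a data pointer.
    (left, right)
}

fn main() {
    // Left record = 0x0123456, right record = 0x789ABCD.
    let node = [0x12, 0x34, 0x56, 0x07, 0x89, 0xAB, 0xCD];
    assert_eq!(decode_node_28(&node), (0x0123456, 0x789ABCD));
}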
Data Section
MMDB-format data types:
| Type | Code | Size | Notes |
|---|---|---|---|
| Pointer | 1 | Variable | Offset into data section |
| String | 2 | Variable | UTF-8 text |
| Double | 3 | 8 bytes | IEEE 754 |
| Bytes | 4 | Variable | Binary data |
| Uint16 | 5 | 2 bytes | Unsigned integer |
| Uint32 | 6 | 4 bytes | Unsigned integer |
| Map | 7 | Variable | Key-value pairs |
| Int32 | 8 | 4 bytes | Signed integer |
| Uint64 | 9 | 8 bytes | Unsigned integer |
| Boolean | 14 | 0 bytes | Value in type byte |
| Float | 15 | 4 bytes | IEEE 754 |
| Array | 11 | Variable | Ordered list |
| Timestamp | 128 | 8 bytes | Matchy extension (Unix epoch seconds) |
See MaxMind DB Format for encoding details.
Matchy Extended Types
Matchy extends the MMDB format with additional types using codes 128+:
| Type | Code | Size | Notes |
|---|---|---|---|
| Timestamp | 128 | 8 bytes | Unix epoch seconds (signed i64) |
These types are stored using the MMDB extended type mechanism (raw byte = code - 7). Timestamp values are serialized to JSON as ISO 8601 strings (e.g., 2025-10-02T18:44:31Z) for human readability while stored compactly as 8 bytes instead of 27-byte strings.
PARAGLOB Section Format
When glob patterns are present, the PARAGLOB section contains:
#![allow(unused)]
fn main() {
#[repr(C)]
struct ParaglobHeader {
magic: [u8; 8], // "PARAGLOB"
version: u32, // Format version (currently 5)
match_mode: u32, // 0=CaseSensitive, 1=CaseInsensitive
ac_node_count: u32, // Number of AC automaton nodes
ac_nodes_offset: u32, // Offset to node array
// ... additional fields for pattern data
}
}
Followed by:
- Aho-Corasick automaton nodes and edges
- Pattern metadata entries
- Glob segment data
- Pattern-to-data mappings
See matchy-format/src/offset_format.rs for the complete ParaglobHeader structure (112 bytes in v5).
String Literals Hash Section Format (Version 2)
When literal strings are present, a hash table section provides O(1) lookups using 96-bit truncated XXH3 hashes:
#![allow(unused)]
fn main() {
#[repr(C)]
struct LiteralHashHeader {
magic: [u8; 4], // "LHSH"
version: u32, // 2
entry_count: u32, // Number of patterns
table_size: u32, // Hash table capacity
reserved1: u32, // Reserved (was strings_offset in v1)
reserved2: u32, // Reserved (was strings_size in v1)
num_shards: u32, // Number of shards (power of 2)
shard_bits: u32, // Bits used for sharding
}
#[repr(C)]
struct HashEntry {
hash: [u8; 12], // 96-bit truncated XXH3_128
pattern_id: u32, // Pattern ID for data lookup
}
}
Key characteristics:
- Hash-only storage: Original strings are not stored (privacy-preserving)
- 96-bit hashes: Negligible collision probability (< 10⁻²⁴ per query)
- Sharded construction: Parallel building for large datasets
- 16-byte entries: Same size as v1, but ~50% smaller total (no string pool)
See matchy-literal-hash crate for implementation details.
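For orientation only, here is a minimal sketch of a lookup over a flat table of the 16-byte entries above. It assumes the 96-bit digest was already computed with XXH3_128 and truncated, and it uses simple open addressing with an all-zero sentinel for empty slots, both of which are simplifications; the actual probing and sharding scheme lives in the matchy-literal-hash crate.
#[derive(Clone, Copy)]
struct HashEntry {
    hash: [u8; 12],  // 96-bit truncated XXH3_128 of the original string
    pattern_id: u32, // Pattern ID used to fetch data from the shared data section
}

const EMPTY: [u8; 12] = [0; 12];

fn home_slot(digest: &[u8; 12], table_len: usize) -> usize {
    // Derive a starting slot from the first 8 bytes of the digest.
    digest[..8].iter().fold(0usize, |acc, &b| (acc << 8) | b as usize) % table_len
}

fn lookup(table: &[HashEntry], digest: [u8; 12]) -> Option<u32> {
    let mut idx = home_slot(&digest, table.len());
    for _ in 0..table.len() {
        let entry = table[idx];
        if entry.hash == digest {
            return Some(entry.pattern_id); // Digest match: resolve data via pattern_id
        }
        if entry.hash == EMPTY {
            return None; // Empty slot: the queried string is not in the database
        }
        idx = (idx + 1) % table.len(); // Linear probe to the next slot
    }
    None
}

fn main() {
    let mut table = vec![HashEntry { hash: EMPTY, pattern_id: 0 }; 8];
    let digest = [7u8; 12];
    table[home_slot(&digest, 8)] = HashEntry { hash: digest, pattern_id: 42 };
    assert_eq!(lookup(&table, digest), Some(42));
    assert_eq!(lookup(&table, [9u8; 12]), None);
}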
Data Alignment
All structures are aligned:
- Header: 8-byte alignment
- Nodes: 8-byte alignment
- Edges: 4-byte alignment
- Hash buckets: 4-byte alignment
Padding bytes are zeros.
Offset Encoding
All offsets are relative to the start of the PARAGLOB section:
File offset = PARAGLOB_SECTION_START + relative_offset
Special values:
- 0x00000000 = NULL pointer
- 0xFFFFFFFF = Invalid/end marker
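A tiny sketch of resolving such an offset, treating both special values as "no target"; the names are illustrative:
const NULL_OFFSET: u32 = 0x0000_0000;
const END_MARKER: u32 = 0xFFFF_FFFF;

fn resolve_offset(paraglob_start: usize, relative: u32) -> Option<usize> {
    match relative {
        NULL_OFFSET | END_MARKER => None, // NULL pointer or invalid/end marker
        rel => Some(paraglob_start + rel as usize), // File offset = section start + relative offset
    }
}

fn main() {
    assert_eq!(resolve_offset(4096, 0x10), Some(4096 + 0x10));
    assert_eq!(resolve_offset(4096, NULL_OFFSET), None);
    assert_eq!(resolve_offset(4096, END_MARKER), None);
}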
Version History
Version 5 (Current)
- Serialized glob segments for zero-copy loading
- Optimized memory layout with ACNodeHot (16 bytes)
- Support for patterns, exact strings, and IP addresses
- Aho-Corasick automaton for pattern matching
- Separate hash table for exact literal matches
- Embedded MMDB data format
Previous Versions
- v4: ACNodeHot (20-byte) for 50% memory reduction
- v3: AC literal mapping for O(1) zero-copy loading
- v2: Data section support for pattern-associated data
- v1: Original format, patterns only
Format Validation
Matchy validates these invariants on load:
- Magic bytes match: “\xAB\xCD\xEFMaxMind.com” at end, “PARAGLOB” if pattern section present
- Version supported: PARAGLOB version 5 currently
- Offsets in bounds: All offsets point within file
- Alignment correct: Structures properly aligned
- Section offsets: Metadata contains correct pattern_section_offset and literal_section_offset
- File size: Must be at least large enough for tree + metadata
Validation errors result in format errors. See matchy validate command for detailed validation.
Memory Mapping
The format is designed for memory mapping:
- No pointer fixups: All offsets are file-relative
- No relocations: Position-independent
- Aligned access: Natural alignment for all types
- Bounds checkable: All sizes/offsets in header
Example:
#![allow(unused)]
fn main() {
let file = File::open("database.mxy")?;
let mmap = unsafe { Mmap::map(&file)? };
// Direct access to structures
let header = read_paraglob_header(&mmap)?;
let nodes = get_node_array(&mmap, header.nodes_offset)?;
}
Cross-Platform Compatibility
Format is platform-independent:
- Endianness: Native byte order (little-endian on x86/ARM). Marker stored for future big-endian support if needed.
- Alignment: Conservative alignment for all platforms
- Sizes: Fixed-size types (u32, not size_t)
- ABI: #[repr(C)] structures
A database built on Linux/x86-64 works on macOS/ARM64 (both little-endian).
Future Extensions
Reserved fields for future versions:
- Pattern compilation flags (case sensitivity, etc.)
- Compressed string tables
- Alternative hash functions
- Additional data formats
Version changes will be backward-compatible when possible.
See Also
MMDB Integration
Technical reference for MaxMind DB (MMDB) compatibility layer.
Overview
Matchy provides a compatibility layer that allows existing libmaxminddb applications to use Matchy databases with minimal code changes.
Compatibility Header
#include <matchy/maxminddb.h>
Provides drop-in replacements for libmaxminddb functions.
Function Mapping
Opening Databases
| libmaxminddb | Matchy Equivalent |
|---|---|
MMDB_open() | matchy_open() |
MMDB_open_from_buffer() | matchy_open_buffer() |
MMDB_close() | matchy_close() |
Lookups
| libmaxminddb | Matchy Equivalent |
|---|---|
MMDB_lookup_string() | matchy_lookup() |
MMDB_lookup_sockaddr() | matchy_lookup_ip() |
Data Access
| libmaxminddb | Matchy Equivalent |
|---|---|
MMDB_get_value() | matchy_aget_value() |
MMDB_get_entry_data_list() | matchy_get_entry_data_list() |
Key Differences
1. Additional Features
Matchy extends MMDB with:
- Pattern matching: Glob patterns with * and ?
- Exact strings: Hash-based literal matching
- Zero-copy strings: No allocation for string results
2. Error Handling
Matchy uses integer error codes:
int32_t err = matchy_lookup(db, "192.0.2.1", &result);
if (err != MATCHY_SUCCESS) {
// Handle error
}
vs. libmaxminddb status codes:
int gai_error, mmdb_error;
MMDB_lookup_result_s result = MMDB_lookup_string(mmdb, "192.0.2.1",
&gai_error, &mmdb_error);
3. Result Lifetime
Matchy requires explicit result freeing:
matchy_result_t *result = NULL;
matchy_lookup(db, query, &result);
if (result) {
// Use result
matchy_free_result(result); // Required!
}
4. Data Types
Matchy uses MMDB-compatible data types but with extended support:
- All MMDB types supported
- Additional types for pattern metadata
- Same binary format for compatibility
Migration Path
Quick Migration
1. Replace includes:
// Old
#include <maxminddb.h>
// New
#include <matchy/maxminddb.h>
2. Update open calls:
// Old
MMDB_s mmdb;
int status = MMDB_open(filename, MMDB_MODE_MMAP, &mmdb);
// New
matchy_t *db = matchy_open(filename);
if (!db) { /* error */ }
3. Update lookups:
// Old
int gai_error, mmdb_error;
MMDB_lookup_result_s result = MMDB_lookup_string(&mmdb, ip, &gai_error, &mmdb_error);
// New
matchy_result_t *result = NULL;
int32_t err = matchy_lookup(db, ip, &result);
if (err == MATCHY_SUCCESS && result) {
    // Use result
    matchy_free_result(result);
}
Gradual Migration
For large codebases:
- Use both libraries side-by-side
- Migrate one component at a time
- Test thoroughly
- Switch fully when ready
Binary Compatibility
Matchy databases are forward-compatible with MMDB:
- Standard MMDB metadata section
- Compatible binary format
- PARAGLOB extensions in separate section
Existing MMDB tools can read Matchy databases (ignoring pattern data).
Performance
Matchy provides similar or better performance:
- IP lookups: Same O(n) binary trie
- Memory usage: Memory-mapped like MMDB
- Load time: <1ms for any size
- Additional: Pattern matching at no cost to IP lookups
Limitations
Not Supported
- MMDB metadata queries (use matchy inspect instead)
- Custom memory allocators
- Legacy MMDB v1 format
Planned
- Full MMDB API compatibility shim
- Automatic format detection
- Transparent fallback to libmaxminddb
See Also
- MMDB Compatibility Guide - User guide
- Migrating from libmaxminddb - Step-by-step migration
- C API Overview - Native Matchy C API
- Binary Format - Database format specification
Matchy Database Format (.mxy)
One of Matchy’s key innovations is its hybrid database format that extends the proven MaxMind DB (MMDB) format while maintaining full backward compatibility. This means you can use Matchy as a drop-in replacement for libmaxminddb, but you also get the ability to query string literals and glob patterns in the same database—without sacrificing performance or compatibility.
If you’re familiar with MMDB files (like GeoIP databases), you already know how to use Matchy. If you need more than just IP lookups, Matchy seamlessly extends the format without breaking existing tools. It’s the best of both worlds: compatibility with the MMDB ecosystem and powerful new query capabilities.
The Matchy database format (.mxy) achieves this by extending the standard MMDB format to support IP addresses, string literals, and glob patterns in a single unified, memory-mappable database file.
Design Goals
- Backward compatibility - Read standard MMDB files without modification
- Forward compatibility - IP-only .mxy files work with existing MMDB tools
- Seamless extension - Add string/pattern support without breaking compatibility
- Performance - Zero overhead for standard MMDB IP lookups
- Single file - All query types in one memory-mappable database
File Structure
The .mxy format uses a dual-section approach with optional extensions:
block-beta
columns 3
block:mmdb["MMDB Section (Required)"]:3
columns 1
meta["MMDB Metadata Header"]
tree["IP Binary Trie"]
data["Shared Data Section"]
end
space:3
block:ext["Extended Section (Optional)"]:3
columns 1
magic["PARAGLOB Magic Bytes"]
strings["String Hash Index"]
patterns["Aho-Corasick Automaton"]
refs["Data References"]
end
data --> refs
style mmdb fill:#e1f5ff,stroke:#0288d1,stroke-width:2px
style ext fill:#fff3e0,stroke:#ef6c00,stroke-width:2px
style data fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
style refs fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
MMDB Section (Always Present)
The base section follows the standard MaxMind DB format:
- MMDB Metadata Header: Database configuration, record size, node count
- IP Binary Trie: Prefix tree for fast IP address lookups
- Shared Data Section: Encoded data values referenced by all query types
Extended Section (Optional)
When string or pattern matching is needed, an additional section is appended:
- PARAGLOB Magic Bytes: 8-byte identifier marking the extended section
- String Hash Index: Hash table for exact string literal matching
- Aho-Corasick Automaton: Multi-pattern matching for glob expressions
- Data References: Offsets pointing back into the shared data section
Key Innovation: Shared Data Section
The critical design element is that both sections reference the same data section:
graph LR
A[IP Lookup] --> D[Shared Data]
B[String Lookup] --> D
C[Pattern Lookup] --> D
style D fill:#c8e6c9,stroke:#388e3c,stroke-width:3px
style A fill:#e1f5ff,stroke:#0288d1,stroke-width:2px
style B fill:#fff3e0,stroke:#ef6c00,stroke-width:2px
style C fill:#fff3e0,stroke:#ef6c00,stroke-width:2px
This means:
- ✅ No data duplication regardless of query type
- ✅ Memory-efficient for databases with mixed query types
- ✅ Single source of truth for all metadata
- ✅ Consistent results across query methods
Compatibility Matrix
| Database Type | Matchy | libmaxminddb | Notes |
|---|---|---|---|
Standard MMDB (.mmdb) | ✅ Full | ✅ Full | Complete compatibility |
IP-only .mxy | ✅ Full | ✅ IP lookups | Extended section absent |
Full .mxy with strings/patterns | ✅ Full | ✅ IP lookups only | Extended section ignored by libmaxminddb |
Reading Standard MMDB Files
Matchy is a drop-in replacement for libmaxminddb:
#![allow(unused)]
fn main() {
// Works with any standard MMDB file
let db = Database::open("GeoLite2-City.mmdb")?;
let result = db.lookup_ip("8.8.8.8")?;
}
Writing IP-Compatible Databases
IP-only .mxy databases work with existing MMDB tools:
# Build database with Matchy
matchy build -o geoip.mxy ips.csv
# Query with libmaxminddb tools
mmdbinspect -db geoip.mxy 8.8.8.8 # Works!
# Query with Matchy for full API
matchy query geoip.mxy 8.8.8.8
Extended Databases
Databases with strings and patterns maintain IP compatibility:
# Build database with all query types
matchy build -o full.mxy \
--ips ips.csv \
--strings domains.csv \
--patterns patterns.csv
# IP lookups work with both tools
mmdbinspect -db full.mxy 1.2.3.4 # ✅ Works
matchy query full.mxy 1.2.3.4 # ✅ Works
# String/pattern lookups only work with Matchy
matchy query full.mxy "example.com" # ✅ Works
matchy query full.mxy "*.example.com" # ✅ Works
Implementation Details
Format Detection Algorithm
Matchy automatically detects the database format on opening:
flowchart TD
A[Open File] --> B{"MMDB magic<br/>bytes present?"}
B -->|Yes| C[Parse MMDB Section]
B -->|No| Z[Error: Invalid Format]
C --> D{"PARAGLOB magic<br/>after MMDB?"}
D -->|Yes| E[Parse Extended Section]
D -->|No| F[IP-only Database]
E --> G[Full Database]
F --> H[Ready]
G --> H
Z --> I[Fail]
style C fill:#e1f5ff,stroke:#0288d1,stroke-width:2px
style E fill:#fff3e0,stroke:#ef6c00,stroke-width:2px
style H fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
style I fill:#ffcdd2,stroke:#c62828,stroke-width:2px
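A minimal sketch of that detection flow using plain byte scans; the real implementation also parses the metadata to locate section offsets, and the names here are illustrative:
#[derive(Debug, PartialEq)]
enum Format {
    Invalid,
    IpOnly,
    Full,
}

const MMDB_MARKER: &[u8] = b"\xAB\xCD\xEFMaxMind.com";
const PARAGLOB_MAGIC: &[u8] = b"PARAGLOB";

fn contains(haystack: &[u8], needle: &[u8]) -> bool {
    haystack.windows(needle.len()).any(|w| w == needle)
}

fn detect(file: &[u8]) -> Format {
    if !contains(file, MMDB_MARKER) {
        return Format::Invalid; // No MMDB metadata marker: not a valid database
    }
    if contains(file, PARAGLOB_MAGIC) {
        Format::Full // Extended section present: strings/patterns available
    } else {
        Format::IpOnly // Standard MMDB content only
    }
}

fn main() {
    assert_eq!(detect(b"tree...\xAB\xCD\xEFMaxMind.com{}"), Format::IpOnly);
    assert_eq!(detect(b"treePARAGLOB...\xAB\xCD\xEFMaxMind.com{}"), Format::Full);
    assert_eq!(detect(b"junk"), Format::Invalid);
}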
Unified API
Regardless of format, the API remains consistent:
#![allow(unused)]
fn main() {
// Single API works for all database types
let db = Database::open("database.mxy")?;
// Query based on input type
let ip_result = db.lookup("192.168.1.1")?; // IP lookup
let str_result = db.lookup("example.com")?; // String lookup
let glob_result = db.lookup("*.example.com")?; // Pattern lookup
}
Memory Mapping
Both sections are memory-mapped for zero-copy access:
- MMDB section uses standard offsets
- Extended section uses internal offsets
- All offsets validated on database open
- No runtime bounds checking overhead
Performance Impact
| Operation | IP-only .mxy | Full .mxy | Standard .mmdb |
|---|---|---|---|
| IP Lookup | Identical | Identical | Baseline |
| String Lookup | N/A | ~30-50ns | N/A |
| Pattern Lookup | N/A | ~200-500ns | N/A |
| Memory Overhead | 0% | Extended section only | Baseline |
The extended section adds zero overhead to IP lookups.
See Also
- Binary Format Details - Low-level format specification
- MMDB Quick Start - Getting started with MMDB compatibility
- System Architecture - Overall system design
- Performance Benchmarks - Detailed performance analysis
Input Formats Reference
Technical specification of supported input formats for building Matchy databases.
Overview
Matchy supports four input formats:
- Text - Simple line-based
- CSV - Comma-separated with metadata
- JSON - Structured data
- MISP - Threat intelligence format
All formats support mixing IPs, patterns, and exact strings.
Text Format
Specification
file = (entry | comment | blank)* ;
entry = ip | cidr | pattern | exact ;
comment = "#" .* "\n" ;
blank = "\n" ;
ip = ipv4 | ipv6 ;
ipv4 = digit{1,3} "." digit{1,3} "." digit{1,3} "." digit{1,3} ;
ipv6 = /* RFC 4291 IPv6 address */ ;
cidr = ip "/" digit{1,3} ;
pattern = .* ( "*" | "?" | "[" ) .* ;
exact = .* ;
Entry Classification
Entries are automatically classified:
- Contains / → CIDR range
- Valid IPv4/IPv6 → IP address
- Contains *, ?, or [ → Glob pattern
- Otherwise → Exact string
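For illustration, a minimal sketch of this classification order; the enum and function names are placeholders, and the sketch checks that the text before the slash parses as an address so that URLs containing slashes still classify as patterns:
use std::net::IpAddr;

#[derive(Debug, PartialEq)]
enum EntryKind {
    Cidr,
    Ip,
    Pattern,
    Exact,
}

fn classify(entry: &str) -> EntryKind {
    // CIDR: an address followed by "/<prefix-len>" (URLs with slashes fall through).
    if let Some((addr, prefix)) = entry.split_once('/') {
        if addr.parse::<IpAddr>().is_ok() && prefix.parse::<u8>().is_ok() {
            return EntryKind::Cidr;
        }
    }
    if entry.parse::<IpAddr>().is_ok() {
        return EntryKind::Ip; // Plain IPv4 or IPv6 address
    }
    if entry.chars().any(|c| matches!(c, '*' | '?' | '[')) {
        return EntryKind::Pattern; // Glob wildcard present
    }
    EntryKind::Exact
}

fn main() {
    assert_eq!(classify("10.0.0.0/8"), EntryKind::Cidr);
    assert_eq!(classify("2001:db8::1"), EntryKind::Ip);
    assert_eq!(classify("http://*/admin/*"), EntryKind::Pattern);
    assert_eq!(classify("exact.match.com"), EntryKind::Exact);
}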
Type Prefixes
Override auto-detection with explicit type prefixes:
| Prefix | Type | Example |
|---|---|---|
literal: | Exact string | literal:*.txt |
glob: | Pattern | glob:test.com |
ip: | IP/CIDR | ip:10.0.0.1 |
The prefix is automatically stripped before storage:
literal:file*.txt # Stored as exact string "file*.txt"
glob:simple.com # Stored as pattern "simple.com"
ip:192.168.1.1 # Stored as IP address 192.168.1.1
See Entry Types - Prefix Technique for details.
Examples
# IPv4 addresses
192.0.2.1
10.0.0.1
# IPv6 addresses
2001:db8::1
::1
# CIDR ranges
10.0.0.0/8
192.168.0.0/16
2001:db8::/32
# Glob patterns
*.example.com
test-*.domain.com
http://*/admin/*
[a-z]*.evil.com
# Exact strings
exact.match.com
specific-domain.com
Limitations
- No metadata support
- No per-entry JSON data
- Whitespace-only lines ignored
- UTF-8 encoding required
CLI Usage
matchy build -o output.mxy input.txt
CSV Format
Specification
file = header row* ;
header = "entry" ("," column_name)* "\n" ;
row = entry_value ("," value)* "\n" ;
Required Columns
| Column | Required | Description |
|---|---|---|
entry or key | Yes | IP, pattern, or exact string |
| Other columns | No | Converted to JSON metadata |
Data Type Mapping
| CSV Value | JSON Type |
|---|---|
"text" | String |
123 | Number |
true/false | Boolean |
| Empty | Null |
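For illustration, a minimal sketch of this mapping; the Value enum is a stand-in for the library’s data model, and the real converter also distinguishes integer from floating-point numbers:
#[derive(Debug, PartialEq)]
enum Value {
    Str(String),
    Num(f64),
    Bool(bool),
    Null,
}

fn from_csv_field(field: &str) -> Value {
    if field.is_empty() {
        Value::Null // Empty cell becomes null
    } else if field == "true" || field == "false" {
        Value::Bool(field == "true")
    } else if let Ok(n) = field.parse::<f64>() {
        Value::Num(n) // Numeric-looking cells become numbers
    } else {
        Value::Str(field.to_string())
    }
}

fn main() {
    assert_eq!(from_csv_field("95"), Value::Num(95.0));
    assert_eq!(from_csv_field("true"), Value::Bool(true));
    assert_eq!(from_csv_field(""), Value::Null);
    assert_eq!(from_csv_field("malware"), Value::Str("malware".into()));
}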
Examples
Simple CSV
entry,category,threat_level
192.0.2.1,malware,high
*.phishing.com,phishing,medium
exact.com,suspicious,low
Generates:
{
"192.0.2.1": {
"category": "malware",
"threat_level": "high"
}
}
Complex CSV
entry,type,score,tags,verified
10.0.0.1,botnet,95,"c2,trojan",true
*.evil.com,phishing,87,spam,false
CSV with Type Prefixes
entry,category,note
literal:test[1].txt,filesystem,Filename with brackets
glob:*.example.com,domain,Pattern match
ip:192.168.1.0/24,network,Private range
Quoting Rules
- Values with commas must be quoted: "value,with,comma"
- Quotes inside values: "value with ""quote"""
- Empty values allowed: entry,,value
CLI Usage
matchy build -i csv -o output.mxy input.csv
JSON Format
Specification
// Object format (recommended)
{
"entry1": { /* metadata */ },
"entry2": { /* metadata */ },
...
}
// Array format
[
{ "entry": "entry1", /* metadata */ },
{ "entry": "entry2", /* metadata */ },
...
]
Object Format (Recommended)
Keys are entries (IPs, patterns, strings)
Values are metadata objects
{
"192.0.2.1": {
"category": "malware",
"threat_level": "high",
"first_seen": "2024-01-15",
"tags": ["botnet", "c2"]
},
"*.phishing.com": {
"category": "phishing",
"threat_level": "medium",
"verified": true
},
"10.0.0.0/8": {
"category": "internal",
"allow": true
}
}
Array Format
Each object must have entry or key field:
[
{
"entry": "192.0.2.1",
"category": "malware",
"score": 95
},
{
"entry": "*.evil.com",
"category": "phishing",
"score": 87
}
]
Array Format with Type Prefixes
[
{
"entry": "literal:file*.backup",
"category": "filesystem",
"note": "Match literal asterisk"
},
{
"entry": "glob:example.com",
"category": "domain",
"note": "Force pattern matching"
},
{
"entry": "ip:10.0.0.0/8",
"category": "network",
"note": "Explicit IP range"
}
]
Supported Types
| JSON Type | Stored As | Notes |
|---|---|---|
string | UTF-8 string | Max 64KB |
number | Float64 or Int32 | Depends on value |
boolean | Boolean | 1 byte |
null | Null marker | 1 byte |
array | Array | Nested arrays supported |
object | Map | Nested objects supported |
Nested Structures
{
"192.0.2.1": {
"threat": {
"category": "malware",
"subcategory": "trojan",
"details": {
"variant": "emotet",
"version": "3.2"
}
},
"tags": ["c2", "botnet", "high-confidence"],
"scores": {
"static": 95,
"dynamic": 87,
"reputation": 92
}
}
}
CLI Usage
matchy build -i json -o output.mxy input.json
MISP Format
Specification
Subset of MISP (Malware Information Sharing Platform) JSON format.
{
"Event": {
"Attribute": [
{
"type": "ip-dst" | "domain" | "url" | /* ... */,
"value": string,
"category": string,
"comment": string,
/* ... additional MISP fields */
}
]
}
}
Supported Attribute Types
| MISP Type | Matchy Classification |
|---|---|
ip-src, ip-dst | IP address |
ip-src\|port, ip-dst\|port | IP address (port ignored) |
domain, hostname | Exact string or pattern |
url | Pattern if contains wildcards |
email | Pattern if contains wildcards |
other | Auto-detect |
Example
{
"Event": {
"info": "Malware Campaign 2024-01",
"Attribute": [
{
"type": "ip-dst",
"value": "192.0.2.1",
"category": "Network activity",
"comment": "C2 server",
"to_ids": true
},
{
"type": "domain",
"value": "evil.example.com",
"category": "Network activity",
"comment": "Phishing domain"
},
{
"type": "url",
"value": "http://*/admin/config.php",
"category": "Payload delivery",
"comment": "Malicious URL pattern"
}
]
}
}
Metadata Extraction
MISP attributes are converted to Matchy metadata:
{
"misp_type": "ip-dst",
"misp_category": "Network activity",
"misp_comment": "C2 server",
"misp_to_ids": true
}
CLI Usage
matchy build -i misp -o output.mxy threat-feed.json
Format Comparison
| Feature | Text | CSV | JSON | MISP |
|---|---|---|---|---|
| Metadata | ❌ | ✅ Simple | ✅ Rich | ✅ Structured |
| Nested data | ❌ | ❌ | ✅ | ✅ |
| Arrays | ❌ | ❌ | ✅ | ✅ |
| Auto-type | ✅ | ✅ | ✅ | Partial |
| Size | Smallest | Small | Medium | Large |
| Readability | High | High | Medium | Low |
| Standard | No | RFC 4180 | RFC 8259 | MISP spec |
Auto-Detection
By Extension
| Extension | Format |
|---|---|
.txt | Text |
.csv | CSV |
.json | JSON (auto-detect object vs. array) |
.misp | MISP |
By Content
If the file extension is unknown, Matchy inspects the content:
- Starts with { → JSON or MISP
- Starts with [ → JSON array
- Contains , → CSV
- Otherwise → Text
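A minimal sketch of that sniffing order; MISP files are JSON, so they are distinguished later by their structure, and the names here are illustrative:
#[derive(Debug, PartialEq)]
enum InputFormat {
    Json,
    Csv,
    Text,
}

fn sniff(content: &str) -> InputFormat {
    match content.trim_start().chars().next() {
        // JSON object or array (MISP feeds are JSON objects too).
        Some('{') | Some('[') => InputFormat::Json,
        // A comma on the first line suggests CSV.
        _ if content.lines().next().map_or(false, |l| l.contains(',')) => InputFormat::Csv,
        _ => InputFormat::Text,
    }
}

fn main() {
    assert_eq!(sniff("{\"1.2.3.4\": {}}"), InputFormat::Json);
    assert_eq!(sniff("entry,category\n1.2.3.4,malware"), InputFormat::Csv);
    assert_eq!(sniff("1.2.3.4\n*.evil.com"), InputFormat::Text);
}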
Character Encoding
Requirement
All formats must be UTF-8 encoded.
Validation
- Automatic UTF-8 validation during build
- Invalid UTF-8 → build error
BOM Handling
UTF-8 BOM (Byte Order Mark) is:
- Detected and skipped
- Not required
- Not preserved in database
Size Limits
| Component | Limit | Notes |
|---|---|---|
| File size | 4GB | Total input file |
| Entry key | 64KB | Single IP/pattern/string |
| JSON value | 16MB | Per-entry metadata |
| Entries | 4B | Total entries in database |
Error Handling
Parse Errors
$ matchy build -i csv bad.csv
Error: Parse error at line 42: Unclosed quote
Encoding Errors
$ matchy build input.txt
Error: Invalid UTF-8 at byte offset 1234
Format Errors
$ matchy build -i json bad.json
Error: Expected object or array at root
Best Practices
Choose the Right Format
- Text: Simple lists without metadata
- CSV: Tabular data with simple metadata
- JSON: Rich structured metadata
- MISP: Threat intelligence feeds
Optimize for Size
- Use text format when no metadata needed
- Avoid deeply nested JSON
- Keep metadata minimal
- Compress input files (gzip)
Validate Before Building
# Validate CSV
csv-validator input.csv
# Validate JSON
jq empty input.json
# Test build
matchy build --dry-run input.json
See Also
- Input Formats Guide - User-friendly examples
- matchy build command - Build command reference
- Database Builder API - Programmatic building
- Data Types Reference - Supported data types
Schemas Reference
Built-in schemas for validating database yield values.
Overview
Matchy includes built-in schemas that define the structure of yield values for common database types. When you specify a known schema type during matchy build, yield values are validated against the schema, catching errors early.
Available Schemas
| Name | Metadata Type | Description |
|---|---|---|
threatdb | ThreatDB-v1 | Threat intelligence with MISP/STIX-compatible fields |
Using Schemas
CLI
Enable schema validation with --database-type:
# Use the short name - enables ThreatDB schema validation
matchy build --database-type threatdb threats.csv -o threats.mxy
# Custom names skip validation
matchy build --database-type "MyCompany-Intel" data.csv -o custom.mxy
When you use a known schema name like threatdb:
- Yield values are validated against the schema during build
- The canonical database_type (ThreatDB-v1) is set in metadata
- Validation errors stop the build with helpful messages
Rust API
Use DatabaseBuilderExt::with_schema() for automatic validation during database building:
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, DatabaseBuilderExt, MatchMode, DataValue};
use std::collections::HashMap;
// Create builder with schema validation
let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive)
.with_schema("threatdb")?;
// Entries are validated automatically
let mut data = HashMap::new();
data.insert("threat_level".to_string(), DataValue::String("high".to_string()));
data.insert("category".to_string(), DataValue::String("malware".to_string()));
data.insert("source".to_string(), DataValue::String("abuse.ch".to_string()));
builder.add_entry("1.2.3.4", data)?; // Validated!
// Invalid data fails immediately
let mut bad_data = HashMap::new();
bad_data.insert("threat_level".to_string(), DataValue::String("extreme".to_string()));
builder.add_entry("2.3.4.5", bad_data)?;
// Error: Validation error: Entry '2.3.4.5': "extreme" is not one of [...]
}
You can also query schema information directly:
#![allow(unused)]
fn main() {
use matchy::schemas::{get_schema_info, is_known_database_type};
// Check if a type has built-in validation
if is_known_database_type("threatdb") {
let info = get_schema_info("threatdb").unwrap();
println!("Canonical type: {}", info.database_type); // "ThreatDB-v1"
}
}
ThreatDB Schema
The ThreatDB schema (threatdb) is designed for threat intelligence databases, with fields compatible with MISP and STIX 2.1 concepts.
Required Fields
| Field | Type | Description |
|---|---|---|
threat_level | string | Severity: critical, high, medium, low, unknown |
category | string | Threat type (lowercase): malware, c2, phishing, etc. |
source | string | Origin feed or organization |
Optional Fields
| Field | Type | Description |
|---|---|---|
confidence | integer | Score 0-100 (STIX 2.1 compatible) |
first_seen | string | ISO 8601 datetime |
last_seen | string | ISO 8601 datetime |
description | string | Human-readable notes |
tags | array | List of strings for classification |
reference | string | URL to external documentation |
tlp | string | Traffic Light Protocol: CLEAR, GREEN, AMBER, AMBER+STRICT, RED |
indicator_type | string | What the key represents: ip-src, domain, url, sha256, etc. |
Threat Levels
| Value | MISP Equivalent | Use Case |
|---|---|---|
critical | - | Active campaigns, zero-days |
high | 1 | Known active threats |
medium | 2 | Suspicious activity |
low | 3 | Low confidence or historical |
unknown | 4 | Insufficient data |
Common Categories
malware c2 phishing botnet ransomware
spam scanner proxy cryptomining dropper
apt tor-exit vpn bruteforce exploit
rat stealer ddos
TLP (Traffic Light Protocol)
| Value | Sharing |
|---|---|
CLEAR | Unrestricted (formerly WHITE) |
GREEN | Community-wide |
AMBER | Limited distribution |
AMBER+STRICT | Organization only |
RED | Named recipients only |
Example: CSV Input
key,threat_level,category,source,confidence,tags
192.0.2.1,high,c2,abuse.ch,95,"emotet,banking"
*.evil.com,medium,phishing,internal,75,
10.0.0.0/8,low,scanner,honeypot,50,
Example: JSON Input
{
"192.0.2.1": {
"threat_level": "high",
"category": "c2",
"source": "abuse.ch",
"confidence": 95,
"first_seen": "2024-01-15T10:30:00Z",
"tags": ["emotet", "banking-trojan"],
"tlp": "AMBER"
},
"*.evil.com": {
"threat_level": "medium",
"category": "phishing",
"source": "internal",
"description": "Phishing campaign targeting employees"
}
}
Example: Build with Validation
$ matchy build --database-type threatdb -f json threats.json -o threats.mxy
Schema validation: enabled (ThreatDB-v1)
Building database from threats.json
Added 2 entries
Successfully wrote threats.mxy
Validation Errors
Invalid data produces clear error messages:
$ cat bad.csv
key,threat_level,category,source
192.0.2.1,critical,malware,abuse.ch
10.0.0.1,extreme,badcat,
$ matchy build --database-type threatdb bad.csv -o out.mxy
Schema validation failed for entry "10.0.0.1"
Validation errors:
- /threat_level: "extreme" is not one of ["critical","high","medium","low","unknown"]
- /source: string length 0 is less than minLength 1
Use a custom --database-type name if you don't want schema validation.
Validating Existing Databases
The matchy validate command checks schema compliance for databases with known database_type:
# Validates structure AND schema if database_type is "ThreatDB-v1"
matchy validate threats.mxy
Validation detects the schema from the database_type metadata field.
Custom Schemas (Future)
Currently, only built-in schemas are supported. Custom schema support via --schema <file> may be added in future versions.
For now, use a custom --database-type name to skip schema validation:
# No validation - your own structure
matchy build --database-type "MyCompany-ThreatFeed-v2" data.json -o custom.mxy
Schema API Reference
DatabaseBuilderExt Trait
The DatabaseBuilderExt trait adds schema support to DatabaseBuilder:
#![allow(unused)]
fn main() {
use matchy::{DatabaseBuilder, DatabaseBuilderExt, MatchMode};
let builder = DatabaseBuilder::new(MatchMode::CaseInsensitive)
.with_schema("threatdb")?;
}
with_schema(schema_name: &str) -> Result<Self, SchemaError>
Configures the builder with automatic schema validation.
- All entries are validated before insertion
- Sets database_type metadata automatically
- Returns an error if the schema name is unknown
#![allow(unused)]
fn main() {
// Valid schema name
let builder = DatabaseBuilder::new(MatchMode::CaseInsensitive)
.with_schema("threatdb")?;
// Unknown schema - returns SchemaError
let result = DatabaseBuilder::new(MatchMode::CaseInsensitive)
.with_schema("unknown");
assert!(result.is_err());
}
Schema Lookup Functions
#![allow(unused)]
fn main() {
use matchy::schemas::{
get_schema_info,
schema_database_type,
detect_schema_from_database_type,
available_schemas,
is_known_database_type,
};
}
get_schema_info(name: &str) -> Option<&'static SchemaInfo>
Returns full schema metadata.
#![allow(unused)]
fn main() {
let info = get_schema_info("threatdb").unwrap();
println!("{}: {}", info.name, info.description);
// threatdb: Threat intelligence database with MISP/STIX-compatible fields
}
schema_database_type(name: &str) -> Option<&'static str>
Maps short name to canonical database_type.
#![allow(unused)]
fn main() {
assert_eq!(schema_database_type("threatdb"), Some("ThreatDB-v1"));
}
detect_schema_from_database_type(db_type: &str) -> Option<&'static str>
Maps database_type back to schema name.
#![allow(unused)]
fn main() {
assert_eq!(detect_schema_from_database_type("ThreatDB-v1"), Some("threatdb"));
}
available_schemas() -> impl Iterator<Item = &'static str>
Lists all available schema names.
#![allow(unused)]
fn main() {
for name in available_schemas() {
println!(" - {}", name);
}
}
is_known_database_type(name: &str) -> bool
Checks if a name is a known schema (short name or database_type).
#![allow(unused)]
fn main() {
assert!(is_known_database_type("threatdb"));
assert!(is_known_database_type("ThreatDB-v1"));
assert!(!is_known_database_type("Custom-Type"));
}
SchemaInfo Struct
#![allow(unused)]
fn main() {
pub struct SchemaInfo {
/// Short name used in CLI (e.g., "threatdb")
pub name: &'static str,
/// Database type string set in metadata (e.g., "ThreatDB-v1")
pub database_type: &'static str,
/// Human-readable description
pub description: &'static str,
}
}
See Also
- DatabaseBuilder - Building databases with schema validation
- matchy build - CLI building with schema validation
- matchy validate - Validating databases
- Data Types Reference - Supported yield value types
- Input Formats - CSV/JSON input format details
Performance Benchmarks
Official performance benchmarks and testing methodology for Matchy.
Overview
Matchy provides built-in benchmarking via the matchy bench command. All benchmarks use real-world data patterns and measure build time, load time, and query throughput.
Running Benchmarks
Quick Benchmark
matchy bench ip
Runs default IP benchmark (1M entries).
Custom Benchmark
matchy bench pattern --count 100000 --query-count 1000000
Benchmark Types
- ip - IPv4 and IPv6 address lookups
- literal - Exact string matching
- pattern - Glob pattern matching
- combined - Mixed workload (IPs + patterns)
See matchy bench command for full options.
Official Results
Generated with version 0.5.2 on Apple M-series hardware
IP Address Lookups
Configuration: 100,000 IPv4 addresses, 100,000 queries
| Metric | Value |
|---|---|
| Build time | 0.04s |
| Build rate | 2.76M IPs/sec |
| Database size | 586 KB |
| Load time | 0.54ms |
| Query throughput | 5.80M queries/sec |
| Query latency | 0.17µs |
Key characteristics:
- O(32) lookups for IPv4, O(128) for IPv6
- Binary trie traversal
- Cache-friendly sequential access
String Literal Matching
Configuration: 50,000 literal strings, 50,000 queries
| Metric | Value |
|---|---|
| Build time | 0.01s |
| Build rate | 4.03M literals/sec |
| Database size | 3.00 MB |
| Load time | 0.49ms |
| Query throughput | 4.58M queries/sec |
| Query latency | 0.22µs |
Key characteristics:
- O(1) hash table lookups
- FxHash for fast non-cryptographic hashing
- Zero-copy memory access
Pattern Matching (Globs)
Configuration: 10,000 glob patterns, 50,000 queries
| Metric | Value |
|---|---|
| Build time | 0.00s |
| Build rate | 4.08M patterns/sec |
| Database size | 62 KB |
| Load time | 0.27ms |
| Query throughput | 4.57M queries/sec |
| Query latency | 0.22µs |
Key characteristics:
- Aho-Corasick automaton
- Parallel pattern matching
- Glob wildcard support
Combined Database
Configuration: 10,000 IPs + 10,000 patterns, 50,000 queries
| Metric | Value |
|---|---|
| Build time | 0.01s |
| Build rate | 1.41M entries/sec |
| Database size | 2.29 MB |
| Load time | 0.46ms |
| Query throughput | 15.43K queries/sec |
| Query latency | 64.83µs |
Key characteristics:
- Realistic mixed workload
- Combined IP and pattern searches
- Production-like performance
Performance Factors
Database Size
| Entries | Build Time | Query Throughput |
|---|---|---|
| 10K | <0.01s | 6.5M queries/sec |
| 100K | 0.04s | 5.8M queries/sec |
| 1M | 0.35s | 5.2M queries/sec |
| 10M | 3.5s | 4.8M queries/sec |
Query performance remains high even with large databases due to memory-mapped access and efficient data structures.
Hit Rate Impact
| Hit Rate | Throughput | Notes |
|---|---|---|
| 0% | 6.2M/sec | Early termination |
| 10% | 5.8M/sec | Default benchmark |
| 50% | 5.5M/sec | Realistic workload |
| 100% | 5.0M/sec | Data extraction overhead |
Higher hit rates show slightly lower throughput due to result extraction overhead.
Trusted Mode
| Mode | Throughput | Notes |
|---|---|---|
| Safe | 4.9M/sec | UTF-8 validation |
| Trusted | 5.8M/sec | ~18% faster |
Memory Usage
Per-Database Overhead
- Handle: ~200 bytes
- File mapping: 0 bytes (OS-managed)
- Query state: 0 bytes (stack-allocated)
Sharing Between Processes
With 10 processes using 1GB database:
- Without mmap: 10 × 1GB = 10GB RAM
- With mmap: ~1GB RAM (shared pages)
Memory-mapped databases are shared between processes automatically by the OS.
Scalability
Vertical Scaling
- Single-threaded: 5.8M queries/sec
- 4 threads: 23M queries/sec (4×)
- 8 threads: 46M queries/sec (8×)
Linear scaling due to thread-safe read-only access.
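As a sketch of that pattern with the Rust API, assuming the database handle can be shared across threads via Arc (as the thread-safety notes indicate) and using a placeholder file name:
use matchy::Database;
use std::sync::Arc;
use std::thread;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Open once; workers share the same memory-mapped database read-only.
    // Assumes Database is Send + Sync, per the documented thread-safety guarantees.
    let db = Arc::new(Database::open("threats.mxy")?);
    let queries = ["192.0.2.1", "malware.evil.com", "10.0.0.1", "example.com"];
    let handles: Vec<_> = queries
        .iter()
        .map(|q| {
            let db = Arc::clone(&db);
            let q = q.to_string();
            // Each thread runs its own lookups against the shared mapping.
            thread::spawn(move || (q.clone(), db.lookup(&q).ok().flatten().is_some()))
        })
        .collect();
    for handle in handles {
        let (query, matched) = handle.join().expect("worker panicked");
        println!("{query}: {}", if matched { "match" } else { "no match" });
    }
    Ok(())
}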
Horizontal Scaling
Multiple servers can use the same database:
- NFS/shared storage: All servers access one copy
- Local copies: Each server loads independently
- Hot reload: Update without restart
Comparison to Alternatives
vs. Traditional Databases
| Feature | Matchy | PostgreSQL | Redis |
|---|---|---|---|
| IP lookups/sec | 5.8M | 50K | 200K |
| Pattern matching | Yes | Slow | No |
| Memory usage | Low (mmap) | High | High |
| Startup time | <1ms | Seconds | Seconds |
| Concurrent reads | Unlimited | Limited | Limited |
vs. In-Memory Structures
| Feature | Matchy | HashMap | Regex Set |
|---|---|---|---|
| Query speed | 5.8M/sec | 10M/sec | 10K/sec |
| Memory | O(1) | O(n) | O(n) |
| Load time | <1ms | Seconds | Seconds |
| Persistence | Built-in | Manual | Manual |
Matchy trades slight query speed for massive memory and load time advantages.
Benchmarking Methodology
Data Generation
Benchmarks use realistic synthetic data:
- IPs: Mix of /32 addresses and CIDR ranges
- Literals: Domain-like strings
- Patterns: Realistic glob patterns
Measurement
- Build time: Time to compile entries
- Save time: Disk write performance
- Load time: Memory-mapping overhead (averaged over 3 runs)
- Query time: Batch query throughput
Hardware
Official benchmarks run on:
- CPU: Apple M-series (ARM64)
- RAM: 16GB+
- Storage: SSD
Results vary by hardware but relative performance remains consistent.
Reproducing Benchmarks
Local Testing
# IP benchmark
matchy bench ip -n 100000 --query-count 100000
# Pattern benchmark
matchy bench pattern -n 10000 --query-count 50000
# Combined benchmark
matchy bench combined -n 20000 --query-count 50000
Continuous Integration
# Run benchmarks and check for regressions
matchy bench ip > results.txt
grep "QPS" results.txt
Custom Workloads
# Build your own database
matchy build -i custom.csv -o test.mxy
# Benchmark it
time matchy query test.mxy < queries.txt
Performance Tuning
For Best Query Performance
- Reuse database handles
- Use memory-mapped files (automatic)
- Keep database on fast storage (SSD)
- Use direct IP lookup when possible
For Best Build Performance
- Sort input data by type
- Use batch additions
- Pre-allocate if entry count known
- Use multiple builders in parallel
For Lowest Memory
- Use memory-mapped mode (default)
- Share databases between processes
- Close unused databases promptly
- Use validated mode (skips validation cache)
See Also
- matchy bench command - Benchmark command reference
- Performance Guide - Optimization strategies
- Architecture - Design and implementation
- Memory Management - Memory usage details
Architecture
Technical overview of Matchy’s design and implementation.
Design Goals
Matchy is built around these core principles:
- Zero-copy access - Memory-mapped files for instant loading
- Unified database - Single file for IPs, strings, and patterns
- Memory efficiency - Shared read-only pages across processes
- High performance - Millions of queries per second
- Safety first - Memory-safe Rust core with careful FFI
System Architecture
┌─────────────────────────────────────┐
│ Matchy Database │
│ (.mxy) │
└─────────────────────────────────────┘
│
├─ MMDB Section (IP lookups)
│ └─ Binary trie for CIDR matching
│
├─ Literal Hash Section
│ └─ FxHash table for exact strings
│
└─ PARAGLOB Section
├─ Aho-Corasick automaton
├─ Pattern table
└─ Data section (JSON values)
Core Components
1. Binary Trie (IP Lookups)
Purpose: Efficient CIDR prefix matching
Algorithm: Binary trie with longest-prefix-match
- Each node represents one bit in the IP address
- IPv4: Maximum 32 levels deep
- IPv6: Maximum 128 levels deep
- O(n) lookup where n = address bits
Memory layout:
Node {
left_offset: u32, // Offset to left child (0 bit)
right_offset: u32, // Offset to right child (1 bit)
data_offset: u32, // Offset to associated data
}
Performance:
- 5.8M lookups/sec for IPv4
- Cache-friendly sequential traversal
- Zero allocations per query
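To make the traversal concrete, here is a minimal in-memory longest-prefix-match walk over the conceptual layout above, using array indices in place of file offsets; the on-disk trie uses the MMDB record encoding described in the Binary Format chapter rather than these explicit fields:
struct Node {
    left: u32,  // Index of child for a 0 bit (0 = none; the root is never a child)
    right: u32, // Index of child for a 1 bit (0 = none)
    data: u32,  // Data reference (0 = no data at this prefix)
}

fn lookup_v4(nodes: &[Node], addr: u32) -> Option<u32> {
    let mut current = 0usize;
    let mut best = None;
    for bit in (0..32).rev() {
        if nodes[current].data != 0 {
            best = Some(nodes[current].data); // Remember the longest prefix seen so far
        }
        let next = if (addr >> bit) & 1 == 0 {
            nodes[current].left
        } else {
            nodes[current].right
        };
        if next == 0 {
            break; // No more specific prefix exists
        }
        current = next as usize;
    }
    if nodes[current].data != 0 {
        best = Some(nodes[current].data);
    }
    best
}

fn main() {
    // Tiny trie: root -> (1 bit) -> node with data, i.e. it covers 128.0.0.0/1.
    let nodes = vec![
        Node { left: 0, right: 1, data: 0 },
        Node { left: 0, right: 0, data: 42 },
    ];
    assert_eq!(lookup_v4(&nodes, u32::from(std::net::Ipv4Addr::new(192, 0, 2, 1))), Some(42));
    assert_eq!(lookup_v4(&nodes, u32::from(std::net::Ipv4Addr::new(10, 0, 0, 1))), None);
}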
2. Literal Hash Table
Purpose: O(1) exact string matching
Algorithm: FxHash with open addressing
- Non-cryptographic hash for speed
- Collision resolution via linear probing
- Load factor kept below 0.75
Memory layout:
HashEntry {
hash: u64, // FxHash of the string
string_offset: u32, // Offset to string data
data_offset: u32, // Offset to associated data
}
Performance:
- 4.58M lookups/sec
- Single memory access for most queries
- Zero string allocations
3. Aho-Corasick Automaton (Pattern Matching)
Purpose: Parallel multi-pattern glob matching
Algorithm: Offset-based Aho-Corasick
- Finite state machine for pattern matching
- Failure links for efficient backtracking
- Glob wildcards: * (any), ? (single), [a-z] (class)
Memory layout:
AcNode {
edges_offset: u32, // Offset to edge table
edges_count: u16, // Number of outgoing edges
failure_offset: u32, // Failure function link
pattern_ids_offset: u32,// Patterns ending here
pattern_count: u16, // Number of patterns
}
AcEdge {
character: u8, // Input character
target_offset: u32, // Target node offset
}
Performance:
- 4.57M lookups/sec
- O(n + m) where n = text length, m = pattern length
- All patterns checked in single pass
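For orientation, a minimal sketch of the matching pass over a pre-built automaton, showing how failure links keep the scan to one pass over the input; automaton construction and glob expansion are omitted, and the in-memory struct is a stand-in for the offset-based layout above:
use std::collections::HashMap;

struct AcNode {
    edges: HashMap<u8, usize>, // Goto transitions
    failure: usize,            // Failure link (root = 0)
    output: Vec<u32>,          // Pattern IDs ending at this state
}

fn scan(nodes: &[AcNode], text: &[u8]) -> Vec<u32> {
    let mut state = 0;
    let mut hits = Vec::new();
    for &byte in text {
        // Follow failure links until a transition exists or we are back at the root.
        while state != 0 && !nodes[state].edges.contains_key(&byte) {
            state = nodes[state].failure;
        }
        state = *nodes[state].edges.get(&byte).unwrap_or(&0);
        // Collect every pattern that ends here, including those reached via failure links.
        let mut out_state = state;
        loop {
            hits.extend_from_slice(&nodes[out_state].output);
            if out_state == 0 {
                break;
            }
            out_state = nodes[out_state].failure;
        }
    }
    hits
}

fn main() {
    // Automaton for the single literal "evil": root -> e -> v -> i -> l.
    let mut nodes: Vec<AcNode> = (0..5)
        .map(|_| AcNode { edges: HashMap::new(), failure: 0, output: Vec::new() })
        .collect();
    for (i, b) in b"evil".iter().enumerate() {
        nodes[i].edges.insert(*b, i + 1);
    }
    nodes[4].output.push(7);
    assert_eq!(scan(&nodes, b"malware.evil.com"), vec![7]);
}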
Data Flow
Query Path
┌───────────────────────────┐
│ Query (text or IP) │
└───────────┬──────────────┘
│
├─ Parse as IP?
│ ├─ Yes → Binary Trie Lookup
│ └─ No ↓
│
├─ Hash Lookup (Exact)
│ ├─ Found → Return result
│ └─ Not found ↓
│
└─ Pattern Match (Aho-Corasick)
├─ Match → Return first
└─ No match → Return NULL
Build Path
┌──────────────────────────────┐
│ Input (CSV, JSON, etc.) │
└─────────────┬────────────────┘
│
├─ Parse entries
│
├─ Categorize:
│ ├─ IP addresses → Binary trie builder
│ ├─ Exact strings → Hash table builder
│ └─ Patterns → Aho-Corasick builder
│
├─ Build data structures
│
├─ Serialize to binary
│
└─ Write .mxy file
Memory Management
Offset-Based Pointers
All internal references use file offsets instead of pointers:
#![allow(unused)]
fn main() {
// NOT this:
struct Node {
left: *const Node, // Pointer (can't mmap)
}
// But this:
struct Node {
left_offset: u32, // Offset (mmap-friendly)
}
}
Benefits:
- Memory-mappable
- Cross-process safe
- Platform-independent
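For illustration, resolving an offset against a memory-mapped byte slice can be sketched as below. The little-endian u32 field layout is an assumption for the example, not the documented .mxy encoding.
// Bounds-checked read of a little-endian u32 at `offset`.
fn read_u32(buf: &[u8], offset: usize) -> Option<u32> {
    let bytes = buf.get(offset..offset + 4)?; // never reads past end of file
    Some(u32::from_le_bytes(bytes.try_into().ok()?))
}
struct NodeView {
    left_offset: u32,
    right_offset: u32,
    data_offset: u32,
}
fn read_node(buf: &[u8], offset: usize) -> Option<NodeView> {
    Some(NodeView {
        left_offset: read_u32(buf, offset)?,
        right_offset: read_u32(buf, offset + 4)?,
        data_offset: read_u32(buf, offset + 8)?,
    })
}
fn main() {
    // 12 bytes = one node: left=24, right=36, data=0
    let file = [24u32.to_le_bytes(), 36u32.to_le_bytes(), 0u32.to_le_bytes()].concat();
    let node = read_node(&file, 0).unwrap();
    assert_eq!((node.left_offset, node.right_offset, node.data_offset), (24, 36, 0));
}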
Memory Layout
┌─────────────────────────────────────┐ ← File start (offset 0)
│ MMDB Metadata (128 bytes) │
├─────────────────────────────────────┤
│ IP Binary Trie │
│ (variable size) │
├─────────────────────────────────────┤
│ Data Section │
│ (JSON values, strings) │
├─────────────────────────────────────┤
│ "PARAGLOB" Magic (8 bytes) │
├─────────────────────────────────────┤
│ PARAGLOB Header │
│ - Node count │
│ - Pattern count │
│ - Offsets to sections │
├─────────────────────────────────────┤
│ AC Automaton Nodes │
├─────────────────────────────────────┤
│ AC Edges │
├─────────────────────────────────────┤
│ Pattern Table │
├─────────────────────────────────────┤
│ Literal Hash Table │
└─────────────────────────────────────┘ ← File end
Thread Safety
Read-Only Operations
Thread-safe:
- Opening databases
- Querying (concurrent reads)
- Inspecting metadata
Multiple threads can safely query the same database:
#![allow(unused)]
fn main() {
// Thread 1
db.lookup("query1")?;
// Thread 2 (safe!)
db.lookup("query2")?;
}
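A minimal runnable sketch, assuming threats.mxy already exists (built as shown elsewhere in this book) and that Database can be shared by reference across threads, per the guarantees above:
use matchy::Database;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let db = Database::open("threats.mxy")?;
    let db = &db; // one shared, read-only handle borrowed by every worker
    std::thread::scope(|s| {
        for query in ["192.0.2.1", "malware.evil.com", "10.5.5.5", "safe.example"] {
            s.spawn(move || {
                let hit = db.lookup(query).ok().flatten().is_some();
                println!("{query}: {}", if hit { "match" } else { "no match" });
            });
        }
    });
    Ok(())
}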
Write Operations
Not thread-safe:
- Building databases (use one builder per thread)
- Modifying entries (immutable after build)
Performance Characteristics
Time Complexity
| Operation | Complexity | Notes |
|---|---|---|
| IP lookup | O(n) | n = address bits (32 or 128) |
| Literal lookup | O(1) | Average case with FxHash |
| Pattern match | O(n+m) | n = text length, m = pattern length |
| Database load | O(1) | Memory-map operation |
| Database build | O(n log n) | n = number of entries |
Space Complexity
| Component | Space | Notes |
|---|---|---|
| Binary trie | O(n) | n = unique IP prefixes |
| Hash table | O(n) | n = literal strings |
| AC automaton | O(m) | m = total pattern characters |
| Data section | O(d) | d = JSON data size |
Optimizations
1. Memory Mapping
- Zero-copy file access
- Shared pages between processes
- OS-managed caching
- Instant “load” time
2. Offset Compression
Where possible, use smaller integer types:
- u16 for small offsets (<65K)
- u32 for medium offsets (<4GB)
- Reduces memory footprint
3. Cache Locality
Data structures optimized for sequential access:
- Nodes stored contiguously
- Edges grouped by source node
- Hot paths use adjacent memory
4. Zero Allocations
Query path allocates zero heap memory:
- Stack-allocated state
- Borrowed references
- No string copies
Safety
Rust Core
Core algorithms in 100% safe Rust:
- No unsafe blocks in hot paths
- Borrow checker prevents use-after-free
- Bounds checking on all array access
FFI Boundary
Unsafe code limited to C FFI:
#![allow(unused)]
fn main() {
// Validation at boundary
if ptr.is_null() {
return ERROR_INVALID_PARAM;
}
// Panic catching
let result = std::panic::catch_unwind(|| {
// ... safe Rust code ...
});
}
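A hypothetical wrapper combining both guards; the function name and error codes are illustrative only, not the real C API:
use std::ffi::CStr;
use std::os::raw::c_char;
const OK: i32 = 0;
const ERROR_INVALID_PARAM: i32 = -1;
const ERROR_INTERNAL: i32 = -2;
/// # Safety
/// `query` must be null or a valid, NUL-terminated C string.
#[no_mangle]
pub unsafe extern "C" fn example_lookup(query: *const c_char) -> i32 {
    // Guard 1: validate pointers at the boundary.
    if query.is_null() {
        return ERROR_INVALID_PARAM;
    }
    let Ok(query) = CStr::from_ptr(query).to_str() else {
        return ERROR_INVALID_PARAM; // not valid UTF-8
    };
    // Guard 2: never let a panic unwind across the FFI boundary.
    let outcome = std::panic::catch_unwind(|| {
        // ... call into the safe Rust core with `query` here ...
        let _ = query;
        OK
    });
    outcome.unwrap_or(ERROR_INTERNAL)
}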
Validation
Multi-level validation:
- Format validation: Check magic bytes, version
- Bounds checking: All offsets within file
- UTF-8 validation: All strings valid UTF-8
- Graph validation: No cycles in automaton
See Also
- Binary Format - Detailed format specification
- Performance Benchmarks - Performance data
- C API Design - FFI safety patterns
- Database Builder - Build process details
CLI Commands
This section documents the Matchy command-line interface.
Commands
- matchy — The Matchy command-line tool
- matchy build — Build a database from input files
- matchy query — Query a database
- matchy match — Scan log files for threats by matching against a database
- matchy extract — Extract patterns (domains, IPs, emails) from log files
- matchy inspect — Inspect database contents and structure
- matchy validate — Validate database safety and correctness
- matchy bench — Benchmark database query performance
matchy
The Matchy command-line interface.
Synopsis
matchy <COMMAND> [OPTIONS]
Description
Matchy is a command-line tool for building and querying databases of IP addresses, CIDR ranges, exact strings, and glob patterns.
Commands
matchy build
Build a database from input files.
$ matchy build threats.csv -o threats.mxy
See matchy build for details.
matchy query
Query a database for matches.
$ matchy query threats.mxy 192.0.2.1
See matchy query for details.
matchy inspect
Inspect database contents and structure.
$ matchy inspect threats.mxy
See matchy inspect for details.
matchy bench
Benchmark database query performance.
$ matchy bench threats.mxy
See matchy bench for details.
Global Options
-h, --help
Print help information for matchy or a specific command.
$ matchy --help
$ matchy build --help
-V, --version
Print version information.
$ matchy --version
matchy 2.0.0
Examples
Complete Workflow
# 1. Build database
$ matchy build threats.csv -o threats.mxy
# 2. Inspect it
$ matchy inspect threats.mxy
# 3. Query it
$ matchy query threats.mxy 192.0.2.1
# 4. Benchmark it
$ matchy bench threats.mxy
Working with GeoIP
# Query a MaxMind GeoLite2 database
$ matchy query GeoLite2-City.mmdb 8.8.8.8
# Inspect it
$ matchy inspect GeoLite2-City.mmdb
Environment Variables
MATCHY_LOG
Set log level: error, warn, info, debug, trace
$ MATCHY_LOG=debug matchy build data.csv -o db.mxy
Exit Status
- 0 - Success
- 1 - Error
Files
Matchy databases typically use the .mxy extension, though any extension works.
Standard MMDB files use .mmdb.
See Also
- Getting Started with CLI - CLI tutorial
- CLI Commands - All commands
- Matchy Guide - Conceptual documentation
matchy build
Build a database from input files.
Synopsis
matchy build [OPTIONS] <INPUT> --output <OUTPUT>
Description
The matchy build command reads entries from input files and builds an optimized
binary database. The input can be CSV, JSON, JSONL, or TSV format.
Options
-o, --output <FILE>
Specify the output database file path.
$ matchy build threats.csv -o threats.mxy
--case-sensitive
Use case-sensitive string matching. By default, matching is case-insensitive.
$ matchy build domains.csv -o domains.mxy --case-sensitive
--format <FORMAT>
Explicitly specify input format: csv, json, jsonl, or tsv. If not specified,
format is detected from file extension.
$ matchy build data.txt --format csv -o output.mxy
-t, --database-type <NAME>
Set the database type in metadata. If you use a known schema name (e.g., threatdb),
field values are validated against the schema during build.
# Enable ThreatDB schema validation
$ matchy build threats.csv -o threats.mxy --database-type threatdb
# Custom type (no validation)
$ matchy build data.csv -o data.mxy --database-type "MyCompany-Intel"
See Schemas Reference for available schemas and validation details.
Examples
Build from CSV
$ cat threats.csv
key,threat_level,category
192.0.2.1,high,malware
10.0.0.0/8,medium,internal
*.evil.com,high,phishing
$ matchy build threats.csv -o threats.mxy
Building database from threats.csv
Added 3 entries
Successfully wrote threats.mxy
Build from JSON Lines
$ cat data.jsonl
{"key": "192.0.2.1", "threat": "high"}
{"key": "*.malware.com", "category": "malware"}
$ matchy build data.jsonl -o database.mxy
Entry Type Detection
Matchy automatically detects entry types from the key format:
| Input | Detected As |
|---|---|
192.0.2.1 | IP Address |
10.0.0.0/8 | CIDR Range |
*.example.com | Pattern (glob) |
example.com | Exact String |
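The heuristic can be approximated in a few lines of Rust (a sketch, not Matchy's exact rules):
use std::net::IpAddr;
#[derive(Debug, PartialEq)]
enum EntryKind {
    Ip,
    Cidr,
    Pattern,
    Exact,
}
fn detect(key: &str) -> EntryKind {
    if key.parse::<IpAddr>().is_ok() {
        return EntryKind::Ip;
    }
    // address/prefix, e.g. 10.0.0.0/8
    if let Some((addr, prefix)) = key.split_once('/') {
        if addr.parse::<IpAddr>().is_ok() && prefix.parse::<u8>().is_ok() {
            return EntryKind::Cidr;
        }
    }
    // glob metacharacters mark a pattern; everything else is an exact string
    if key.chars().any(|c| matches!(c, '*' | '?' | '[')) {
        EntryKind::Pattern
    } else {
        EntryKind::Exact
    }
}
fn main() {
    assert_eq!(detect("192.0.2.1"), EntryKind::Ip);
    assert_eq!(detect("10.0.0.0/8"), EntryKind::Cidr);
    assert_eq!(detect("*.example.com"), EntryKind::Pattern);
    assert_eq!(detect("example.com"), EntryKind::Exact);
}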
Explicit Type Control
Use type prefixes to override auto-detection:
$ cat entries.txt
literal:*.not-a-glob.txt
glob:simple-string.com
ip:192.168.1.1
$ matchy build entries.txt -o output.mxy
| Prefix | Type | Example |
|---|---|---|
literal: | Exact String | literal:file*.txt matches only “file*.txt” |
glob: | Pattern | glob:test.com treated as pattern |
ip: | IP/CIDR | ip:10.0.0.1 forced as IP |
The prefix is automatically stripped before storage. This is useful when:
- String contains *, ?, or [ that should be literal
- Forcing pattern matching for consistency
- Disambiguating edge cases
See Entry Types - Prefix Technique for complete documentation.
See Also
- matchy query - Query databases
- matchy inspect - Inspect database contents
- First Database with CLI - Tutorial
matchy query
Query a database for matches.
Synopsis
matchy query <DATABASE> <QUERY>
Description
The matchy query command searches a database for entries matching the query string.
Arguments
<DATABASE>
Path to the database file to query.
<QUERY>
The string to search for. Can be an IP address, domain, or any string.
Examples
Query an IP Address
$ matchy query threats.mxy 192.0.2.1
Found: IP address 192.0.2.1/32
threat_level: "high"
category: "malware"
Query a CIDR Range
$ matchy query threats.mxy 10.5.5.5
Found: IP address 10.5.5.5 (matched 10.0.0.0/8)
threat_level: "medium"
category: "internal"
Query a Pattern
$ matchy query threats.mxy phishing.evil.com
Found: Pattern match
Matched patterns: *.evil.com
threat_level: "high"
category: "phishing"
Query an Exact String
$ matchy query threats.mxy evil.com
Found: Exact string match
threat_level: "critical"
No Match
$ matchy query threats.mxy safe.com
Not found
Output Format
The output shows:
- Match type (IP, CIDR, pattern, exact string)
- Matched entry details
- Associated data fields
Exit Status
- 0 - Match found
- 1 - No match or error
See Also
- matchy build - Build databases
- matchy inspect - Inspect databases
- Entry Types - Understanding matches
matchy match
Scan log files or streams for threats by matching against a database.
Synopsis
matchy match [OPTIONS] <DATABASE> <INPUT>...
Description
The matchy match command processes log files or stdin, automatically extracting IP addresses, domains, and email addresses from each line and checking them against the database. This is designed for operational testing and real-time threat detection in log streams.
Key features:
- Automatic extraction of IPs, domains, and emails from unstructured logs
- SIMD-accelerated scanning (200-500 MB/sec typical throughput)
- Outputs JSON (NDJSON format) to stdout for easy parsing
- Statistics and diagnostics to stderr
- Memory-efficient streaming processing
Arguments
<DATABASE>
Path to the database file to query. Supports:
- .mxy files - Pre-built Matchy database (fastest, recommended for production)
- .json files - JSON source file (auto-built in memory)
- .csv files - CSV source file (auto-built in memory)
When a JSON or CSV file is provided, matchy automatically builds the database in-memory before matching. This is convenient for quick testing and ad-hoc analysis, but pre-building with matchy build is recommended for repeated use.
<INPUT>...
One or more input files containing log data (one line per entry), or - for stdin.
Multiple files can be processed sequentially or in parallel (see -j, --threads).
Options
-j, --threads <THREADS>
Number of worker threads for parallel processing (default: auto-detect).
- auto or 0 - Use all available CPU cores (default)
- 1 - Sequential processing (single-threaded)
- N - Use N worker threads
$ matchy match threats.mxy *.log -j auto # Parallel (all cores)
$ matchy match threats.mxy *.log -j 4 # Parallel (4 threads)
$ matchy match threats.mxy *.log -j 1 # Sequential
Parallel processing benefits:
- 2-8x faster throughput on multi-core systems
- Better CPU utilization for I/O-bound workloads
- Scales with number of CPU cores
- Each worker has its own LRU cache
When to use sequential mode (-j 1):
- Single small file
- When output order matters
- Debugging/testing
-f, --follow
Follow log file(s) for new data (like tail -f).
Watches input files for new content and processes lines as they are appended. Press Ctrl+C to stop.
$ matchy match threats.mxy /var/log/app.log -f --stats
[INFO] Mode: Follow (watch files for new content)
...
Follow mode features:
- Monitors files for changes using file system notifications
- Processes new lines immediately as they are written
- Supports multiple files simultaneously
- Works with parallel processing (-j flag)
- Graceful shutdown on Ctrl+C
--batch-bytes <SIZE>
Batch size in bytes for parallel mode (default: 131072 = 128KB).
Controls how input is divided among worker threads. Larger batches reduce overhead but increase memory usage.
$ matchy match threats.mxy huge.log -j auto --batch-bytes 262144 # 256KB batches
--format <FORMAT>
Output format (default: json):
- json - NDJSON format (one JSON object per match on stdout)
- summary - Statistics only (no match output)
$ matchy match threats.mxy access.log --format json
$ matchy match threats.mxy access.log --format summary --stats
-s, --stats
Show detailed statistics to stderr including:
- Processing mode (sequential/parallel/follow)
- Lines processed and match rate
- Candidate extraction breakdown (IPv4, IPv6, domains, emails)
- Throughput (MB/s)
- Timing samples (extraction and lookup)
- Cache hit rate
- Number of files processed (in multi-file mode)
$ matchy match threats.mxy access.log --stats
-p, --progress
Show live progress updates during processing.
Displays a live 3-line progress indicator showing:
- Lines processed, matches found, hit rate, bytes processed, throughput, elapsed time
- Candidate breakdown (IPv4, IPv6, domains, emails)
- Lookup query rate
On TTY (terminal), progress updates in place. On non-TTY (redirected stderr), prints periodic snapshots.
$ matchy match threats.mxy huge.log -j auto --progress
[PROGRESS] Lines: 1,234,567 | Matches: 4,523 (0.4%) | Processed: 512 MB | Throughput: 450 MB/s | Time: 1.1s
Candidates: 1,456,789 total (IPv4: 1,234,567, IPv6: 123, Domains: 234,567, Emails: 12,345)
Lookup rate: 1,324.35K queries/sec
--cache-size <SIZE>
Set LRU cache capacity for query results (default: 10000). Use 0 to disable caching.
$ matchy match threats.mxy access.log --cache-size 50000
$ matchy match threats.mxy access.log --cache-size 0 # No cache
Examples
Scan Apache Access Log
$ matchy match threats.mxy /var/log/apache2/access.log --stats
[INFO] Loaded database: threats.mxy
[INFO] Load time: 12.45ms
[INFO] Cache: 10000 entries
[INFO] Extractor configured for: IPs, strings
[INFO] Processing stdin...
{"timestamp":"1697500800.123","line_number":1,"matched_text":"192.0.2.1","input_line":"192.0.2.1 - - [17/Oct/2024:10:00:00 +0000] \"GET /login HTTP/1.1\" 200 1234","match_type":"ip","prefix_len":32,"cidr":"192.0.2.1/32","data":{"threat_level":"high","category":"malware"}}
{"timestamp":"1697500800.456","line_number":5,"matched_text":"evil.com","input_line":"Request from evil.com blocked","match_type":"pattern","pattern_count":1,"data":[{"threat_level":"critical"}]}
[INFO] Processing complete
[INFO] Lines processed: 15,234
[INFO] Lines with matches: 127 (0.8%)
[INFO] Total matches: 145
[INFO] Candidates tested: 18,456
[INFO] IPv4: 15,234
[INFO] Domains: 3,222
[INFO] Throughput: 450.23 MB/s
[INFO] Total time: 0.15s
[INFO] Cache: 10,000 entries (92.3% hit rate)
Process stdin Stream
$ tail -f /var/log/syslog | matchy match threats.mxy - --stats
Parallel Processing (Multiple Files)
$ matchy match threats.mxy /var/log/*.log -j auto --stats --progress
[INFO] Mode: Parallel (8 worker threads)
[INFO] Batch size: 131072 bytes
[INFO] Loaded database: threats.mxy
[INFO] Load time: 12.45ms
[INFO] Cache: 10000 entries per worker
[PROGRESS] Lines: 5,234,123 | Matches: 8,456 (0.2%) | Processed: 2.1 GB | Throughput: 820 MB/s | Time: 12.3s
Candidates: 6,123,456 (IPv4: 5,000,000, IPv6: 234, Domains: 1,123,222, Emails: 0)
Lookup rate: 497.85K queries/sec
[INFO] === Processing Complete ===
[INFO] Files processed: 47
[INFO] Lines processed: 5,234,123
[INFO] Lines with matches: 8,456 (0.2%)
[INFO] Throughput: 820.15 MB/s
[INFO] Total time: 12.34s
Follow Mode (Log Tailing)
$ matchy match threats.mxy /var/log/app.log -f --stats
[INFO] Mode: Follow (watch files for new content)
[INFO] Loaded database: threats.mxy
[INFO] Extractor configured for: IPs, strings
[INFO] Watching for changes... (Ctrl+C to stop)
{"timestamp":"1697500850.123","line_number":42,"matched_text":"malware.com", ...}
{"timestamp":"1697500851.456","line_number":43,"matched_text":"192.0.2.50", ...}
^C
[INFO] Shutting down...
[INFO] Lines processed: 89
[INFO] Lines with matches: 2 (2.2%)
Parallel Follow Mode (Multiple Log Files)
$ matchy match threats.mxy /var/log/app*.log -f -j 4 --stats
[INFO] Mode: Follow (watch files for new content)
[INFO] Using parallel follow with 4 worker threads
...
Quick Testing with Source Files (Auto-Build)
Skip the build step for quick ad-hoc analysis:
# JSON source file (builds database in-memory automatically)
$ cat threats.json
[
{"key": "192.168.1.0/24", "data": {"type": "internal"}},
{"key": "*.malware.com", "data": {"severity": "high"}},
{"key": "evil.example.com", "data": {"category": "phishing"}}
]
$ matchy match threats.json access.log --stats
[INFO] Building database from JSON file...
[INFO] Loaded 3 entries from JSON
[INFO] Database: 1 IPs, 1 literals, 1 globs
[INFO] Built database from: threats.json
{"matched_text":"192.168.1.50","match_type":"ip",...}
# CSV source file
$ cat threats.csv
key,type,severity
192.168.1.0/24,internal,low
*.malware.com,malware,high
$ matchy match threats.csv access.log
Note: Auto-building is convenient for testing, but pre-building with
matchy buildis faster for repeated use since it avoids rebuilding on every invocation.
Extract Only Matches
$ matchy match threats.mxy access.log | jq -r '.matched_text'
192.0.2.1
evil.com
phishing.example.com
Count Matches by Type
$ matchy match threats.mxy access.log | jq -r '.match_type' | sort | uniq -c
89 ip
38 pattern
Output Format
JSON Output (NDJSON)
Each match is a JSON object on a single line:
{
"timestamp": "1697500800.123",
"line_number": 42,
"matched_text": "192.0.2.1",
"input_line": "Original log line containing the match...",
"match_type": "ip",
"prefix_len": 24,
"cidr": "192.0.2.0/24",
"data": {
"threat_level": "high",
"category": "malware"
}
}
For pattern matches:
{
"timestamp": "1697500800.456",
"line_number": 127,
"matched_text": "evil.example.com",
"input_line": "DNS query for evil.example.com",
"match_type": "pattern",
"pattern_count": 2,
"data": [
{"threat_level": "high"},
{"category": "phishing"}
]
}
Field Reference
| Field | Type | Description |
|---|---|---|
timestamp | string | Unix timestamp with milliseconds |
line_number | number | Line number in input file |
matched_text | string | The extracted text that matched |
input_line | string | Complete original log line |
match_type | string | "ip" or "pattern" |
prefix_len | number | IP: CIDR prefix length |
cidr | string | IP: Canonical CIDR notation |
pattern_count | number | Pattern: Number of patterns matched |
data | object/array | Associated metadata from database |
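Because every match is one JSON object per line, the stream is easy to post-process. A small Rust sketch that counts matches by type from piped input, using the field names above (serde_json is assumed as a dependency):
// Usage: matchy match threats.mxy access.log | <this program>
// [dependencies]
// serde_json = "1"
use std::collections::HashMap;
use std::io::{self, BufRead};
fn main() -> io::Result<()> {
    let mut counts: HashMap<String, u64> = HashMap::new();
    for line in io::stdin().lock().lines() {
        let line = line?;
        let Ok(event) = serde_json::from_str::<serde_json::Value>(&line) else {
            continue; // skip anything that isn't a match record
        };
        if let Some(kind) = event.get("match_type").and_then(|v| v.as_str()) {
            *counts.entry(kind.to_string()).or_insert(0) += 1;
        }
        if let Some(text) = event.get("matched_text").and_then(|v| v.as_str()) {
            eprintln!("hit: {text}");
        }
    }
    println!("{counts:?}");
    Ok(())
}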
Pattern Extraction
The command automatically extracts and tests:
- IPv4 addresses: 192.0.2.1, 10.0.0.0
- IPv6 addresses: 2001:db8::1, ::ffff:192.0.2.1
- Domain names: example.com, sub.domain.com
- Email addresses: user@example.com
Extraction is context-aware with word boundaries and validates format (TLD checks for domains, etc.).
Performance
Typical throughput:
- Sequential mode: 200-500 MB/s on modern hardware
- Parallel mode: 400-2000 MB/s depending on core count and workload
Parallel performance scaling:
- 2 cores: ~1.8x speedup
- 4 cores: ~3.2x speedup
- 8 cores: ~5.5x speedup
- 16+ cores: ~8-10x speedup (diminishing returns)
Best practices for performance:
- Use parallel mode (-j auto) for multiple large files
- Enable caching (default) for repeated patterns
- Increase --batch-bytes for very large files (>1GB)
- Use sequential mode for small files (<10MB total)
Exit Status
- 0 - Success (even if no matches found)
- 1 - Error (file not found, invalid database, etc.)
See Also
- matchy query - Single query testing
- matchy build - Build databases
- Pattern Extraction Guide - Details on extraction
- Query Result Caching - Cache optimization
matchy extract
Extract patterns (domains, IPs, emails, hashes, cryptocurrency addresses) from log files or unstructured text.
Synopsis
matchy extract [OPTIONS] <INPUT>...
Description
The matchy extract command scans log files or streams to automatically extract IP addresses, domain names, email addresses, file hashes, and cryptocurrency addresses from unstructured text. This is useful for:
- Generating threat intelligence feeds from logs
- Building input lists for matchy build
- Pre-filtering data before database matching
Key features:
- SIMD-accelerated extraction (200-500 MB/sec typical throughput)
- Multiple output formats: JSON, CSV, plain text
- Configurable extraction types
- Unicode/IDN domain support with automatic punycode conversion
- Word boundary detection for accurate extraction
- Deduplication with the --unique flag
Arguments
<INPUT>...
One or more log files to process (one entry per line), or - for stdin.
$ matchy extract access.log
$ matchy extract log1.txt log2.txt log3.txt
$ cat access.log | matchy extract -
Options
--format <FORMAT>
Output format (default: json):
- json - NDJSON format (one JSON object per pattern)
- csv - CSV format with header (type, value columns)
- text - Plain text (one pattern per line, no metadata)
$ matchy extract access.log --format json
{"type":"domain","value":"example.com"}
{"type":"ipv4","value":"192.0.2.1"}
$ matchy extract access.log --format csv
type,value
domain,"example.com"
ipv4,"192.0.2.1"
$ matchy extract access.log --format text
example.com
192.0.2.1
--types <TYPES>
Comma-separated extraction types (default: all):
- ipv4 or ip4 - IPv4 addresses only
- ipv6 or ip6 - IPv6 addresses only
- ip - Both IPv4 and IPv6
- domain or domains - Domain names
- email or emails - Email addresses
- hash or hashes - File hashes (MD5, SHA1, SHA256, SHA384)
- bitcoin or btc - Bitcoin addresses (all formats)
- ethereum or eth - Ethereum addresses
- monero or xmr - Monero addresses
- crypto - All cryptocurrency addresses
- all - Extract everything (default)
$ matchy extract access.log --types ipv4,domain
$ matchy extract access.log --types ip # IPv4 + IPv6
$ matchy extract access.log --types all # Everything
--min-labels <NUMBER>
Minimum number of domain labels to extract (default: 2).
$ matchy extract access.log --min-labels 2 # example.com (default)
$ matchy extract access.log --min-labels 3 # sub.example.com
This is useful to filter out bare hostnames or require fully-qualified domain names.
--no-boundaries
Disable word boundary requirements, allowing patterns to be extracted from the middle of text.
By default, extraction requires word boundaries (whitespace, punctuation) around patterns. Use this flag to extract patterns embedded in other text.
$ matchy extract access.log --no-boundaries
-u, --unique
Output only unique patterns (deduplicate across all input).
$ matchy extract access.log --unique
This maintains a hash set of seen patterns and outputs each unique pattern only once.
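The same idea in a few lines of Rust, for readers who want the behaviour in their own tooling (a sketch, not the matchy source):
use std::collections::HashSet;
use std::io::{self, BufRead, Write};
fn main() -> io::Result<()> {
    let stdin = io::stdin();
    let mut stdout = io::stdout().lock();
    let mut seen: HashSet<String> = HashSet::new();
    for line in stdin.lock().lines() {
        let value = line?;
        // insert() returns false if the value was already present.
        if seen.insert(value.clone()) {
            writeln!(stdout, "{value}")?;
        }
    }
    Ok(())
}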
-s, --stats
Show extraction statistics to stderr.
$ matchy extract access.log --stats
[INFO] Extracting: IPv4, IPv6, domains, emails
[INFO] Min domain labels: 2
[INFO] Word boundaries: true
[INFO] Unique mode: false
[INFO] === Extraction Complete ===
[INFO] Lines processed: 15,234
[INFO] Patterns found: 3,456
[INFO] IPv4: 2,100
[INFO] IPv6: 23
[INFO] Domains: 1,200
[INFO] Emails: 133
[INFO] Throughput: 450.23 MB/s
[INFO] Total time: 0.15s
Statistics are always written to stderr, leaving stdout clean for piped output.
--show-candidates
Show candidate extraction details for debugging (output to stderr).
$ matchy extract access.log --show-candidates
[CANDIDATE] Domain at 45-61: example.com
[CANDIDATE] IPv4 at 0-10: 192.0.2.1
[CANDIDATE] Email at 23-42: user@example.com
Examples
Extract All Patterns (JSON)
$ matchy extract access.log
{"type":"ipv4","value":"192.0.2.1"}
{"type":"domain","value":"example.com"}
{"type":"email","value":"user@example.com"}
{"type":"ipv6","value":"2001:db8::1"}
Extract Only Domains
$ matchy extract access.log --types domain --format text
example.com
subdomain.example.org
malware.net
Build Threat Intel Database from Logs
Extract unique domains and build a database:
$ matchy extract suspicious.log \
--types domain \
--unique \
--format text \
> domains.txt
$ echo "key,threat_level" > threats.csv
$ cat domains.txt | sed 's/^/&,high/' >> threats.csv
$ matchy build threats.csv -o threats.mxy
Extract IPs with Statistics
$ matchy extract access.log --types ip --stats --unique
{"type":"ipv4","value":"192.0.2.1"}
{"type":"ipv4","value":"198.51.100.42"}
{"type":"ipv6","value":"2001:db8::1"}
[INFO] Lines processed: 10,000
[INFO] Patterns found: 2,345
[INFO] IPv4: 2,320
[INFO] IPv6: 25
[INFO] Throughput: 380.15 MB/s
[INFO] Total time: 0.08s
CSV Output for Spreadsheet Import
$ matchy extract firewall.log --format csv > patterns.csv
$ open patterns.csv # Opens in Excel/Numbers/etc.
Extract from stdin Stream
$ tail -f /var/log/syslog | matchy extract - --types domain --stats
Process Multiple Files
$ matchy extract *.log --stats --unique > all_patterns.json
Output Formats
JSON (NDJSON)
One JSON object per line with type and value:
{"type":"domain","value":"example.com"}
{"type":"ipv4","value":"192.0.2.1"}
{"type":"ipv6","value":"2001:db8::1"}
{"type":"email","value":"user@example.com"}
CSV
Header row followed by data rows:
type,value
domain,"example.com"
ipv4,"192.0.2.1"
ipv6,"2001:db8::1"
email,"user@example.com"
Values are properly escaped (quotes doubled for embedded quotes).
Text
One pattern per line, no metadata:
example.com
192.0.2.1
2001:db8::1
user@example.com
Pattern Extraction Details
IPv4 Addresses
Extracts standard IPv4 addresses: 192.0.2.1, 10.0.0.1
Validates format and rejects invalid addresses (e.g., 999.999.999.999).
IPv6 Addresses
Extracts IPv6 addresses in all standard formats:
- Full: 2001:0db8:0000:0000:0000:0000:0000:0001
- Compressed: 2001:db8::1
- IPv4-mapped: ::ffff:192.0.2.1
Domain Names
Extracts domain names with proper TLD validation:
- example.com
- subdomain.example.org
- multi.level.subdomain.co.uk
Unicode/IDN support: International domain names are automatically converted to punycode:
- Input: münchen.de
- Output: xn--mnchen-3ya.de
TLD validation: Only domains with valid top-level domains are extracted (uses embedded TLD automaton with Public Suffix List data).
Email Addresses
Extracts email addresses with format validation:
- user@example.com
- first.last@subdomain.example.org
- admin+tag@example.net
File Hashes
Extracts common cryptographic hashes:
- MD5: 32 hex characters (e.g., 5d41402abc4b2a76b9719d911017c592)
- SHA1: 40 hex characters (e.g., 2fd4e1c67a2d28fced849ee1bb76e7391b93eb12)
- SHA256: 64 hex characters
- SHA384: 96 hex characters
Useful for malware analysis and threat intelligence feeds.
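A minimal sketch of that classification, an all-hex check plus digest length, not the extractor's actual code:
fn classify_hash(candidate: &str) -> Option<&'static str> {
    if !candidate.chars().all(|c| c.is_ascii_hexdigit()) {
        return None;
    }
    match candidate.len() {
        32 => Some("md5"),
        40 => Some("sha1"),
        64 => Some("sha256"),
        96 => Some("sha384"),
        _ => None,
    }
}
fn main() {
    assert_eq!(classify_hash("5d41402abc4b2a76b9719d911017c592"), Some("md5"));
    assert_eq!(classify_hash("not-a-hash"), None);
}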
Cryptocurrency Addresses
Extracts blockchain addresses with checksum validation:
Bitcoin (all formats):
- Legacy (P2PKH): 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa
- P2SH: 3Cbq7aT1tY8kMxWLbitaG7yT6bPbKChq64
- Bech32 (SegWit): bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq
Ethereum:
- Format: 0x5aeda56215b167893e80b4fe645ba6d5bab767de (42 chars)
- Validates EIP-55 checksum for mixed-case addresses
- Accepts all-lowercase addresses without checksum
Monero:
- Standard addresses starting with 4 or 8 (~95 characters)
- Integrated addresses (~106 characters)
Validation: All addresses are validated with cryptographic checksums:
- Bitcoin: Base58Check (double SHA256) or Bech32
- Ethereum: Keccak256-based EIP-55 checksum
- Monero: Keccak256 checksum
Useful for ransomware analysis, fraud investigation, and darknet marketplace intelligence.
Performance
Typical throughput: 200-500 MB/s on modern hardware.
Performance factors:
- Extraction types: Fewer types = faster (skip unnecessary checks)
- Word boundaries: Enabled (default) = faster (reduces false matches)
- Unique mode: Enabled = slower (hash set overhead for deduplication)
- Output format: Text = fastest, JSON = moderate, CSV = moderate
Exit Status
- 0 - Success (even if no patterns found)
- 1 - Error (file not found, invalid arguments, etc.)
See Also
- matchy match - Match extracted patterns against database
- matchy build - Build database from extracted patterns
- Pattern Extraction Guide - Detailed extraction documentation
matchy inspect
Inspect database contents and structure.
Synopsis
matchy inspect <DATABASE>
Description
The matchy inspect command displays information about a database including size,
entry counts, and structure.
Arguments
<DATABASE>
Path to the database file to inspect.
Examples
Basic Inspection
$ matchy inspect threats.mxy
Database: threats.mxy
Size: 15,847,293 bytes (15.1 MB)
Format: Matchy Extended MMDB
Match mode: CaseInsensitive
Entry counts:
IP addresses: 1,523
CIDR ranges: 87
Exact strings: 2,341
Patterns: 8,492
Total: 12,443 entries
Performance estimates:
IP queries: ~7M/sec
Pattern queries: ~2M/sec
String queries: ~8M/sec
Large Database
$ matchy inspect large.mxy
Database: large.mxy
Size: 234,891,234 bytes (234.9 MB)
Format: Matchy Extended MMDB
Match mode: CaseInsensitive
Entry counts:
IP addresses: 85,234
CIDR ranges: 1,523
Exact strings: 42,891
Patterns: 52,341
Total: 181,989 entries
MMDB File
$ matchy inspect GeoLite2-City.mmdb
Database: GeoLite2-City.mmdb
Size: 67,234,891 bytes (67.2 MB)
Format: Standard MMDB
Match mode: N/A (IP-only database)
Entry counts:
IP addresses: ~3,000,000
CIDR ranges: Included in IP tree
Exact strings: 0
Patterns: 0
Output Information
The inspect command shows:
- File size
- Database format (MMDB or Matchy Extended)
- Match mode (case-sensitive or insensitive)
- Entry counts by type
- Performance estimates
Use Cases
Inspect is useful for:
- Verifying database contents
- Checking file size before deployment
- Estimating query performance
- Debugging database issues
Exit Status
- 0 - Success
- 1 - Error (file not found, invalid format, etc.)
See Also
- matchy build - Build databases
- matchy bench - Benchmark performance
- Database Concepts - Understanding databases
matchy validate
Validate a database file for safety and correctness.
Synopsis
matchy validate [OPTIONS] <DATABASE>
Description
The validate command performs comprehensive validation of Matchy database files (.mxy) to ensure they are safe to load and use. This is especially important when working with databases from untrusted sources.
Validation checks include:
- MMDB format structure: Valid metadata, search tree, and data sections
- PARAGLOB section integrity: Pattern automaton structure and consistency
- Bounds checking: All offsets point within the file
- UTF-8 validity: All strings are valid UTF-8
- Graph integrity: No cycles in the failure function
- Data consistency: Arrays, maps, and pointers are valid
- Schema validation: If database_type matches a known schema (e.g., ThreatDB-v1), field values are validated against it
The validator is designed to detect malformed, corrupted, or potentially malicious databases without panicking or causing undefined behavior.
Options
-l, --level <LEVEL>
Validation strictness level. Default: strict
Levels:
- standard: Basic checks - offsets, UTF-8, structure
- strict: Deep analysis - cycles, redundancy, consistency (default)
- audit: Track unsafe code paths and trust assumptions
-j, --json
Output results as JSON instead of human-readable format.
-v, --verbose
Show detailed information including warnings and info messages.
-h, --help
Print help information.
Arguments
<DATABASE>
Path to the Matchy database file (.mxy) to validate.
Examples
Basic Validation
Validate with default strict checking:
matchy validate database.mxy
Shows:
- Validation level used (strict by default)
- Database statistics (nodes, patterns, IPs, size)
- Validation time
- Pass/fail status with clear ✅/❌ indicator
Standard Validation
Use faster standard validation:
matchy validate --level standard database.mxy
Verbose Output
Show warnings and informational messages:
matchy validate --verbose database.mxy
Adds additional detail:
- Warnings: Non-fatal issues (unreferenced patterns, duplicates)
- Information: Validation steps completed successfully
- Useful for understanding what was checked and any potential optimizations
JSON Output
Machine-readable JSON format:
matchy validate --json database.mxy
Provides structured output with:
- is_valid: Boolean pass/fail
- duration_ms: Validation time
- errors, warnings, info: Categorized messages
- stats: Detailed database metrics (node count, pattern count, file size, etc.)
Useful for CI/CD pipelines and automated testing.
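For example, a CI step could shell out to the validator and parse the report (a sketch; assumes serde_json as a dependency and a matchy binary on PATH):
// [dependencies]
// serde_json = "1"
use std::process::Command;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let output = Command::new("matchy")
        .args(["validate", "--json", "database.mxy"])
        .output()?;
    let report: serde_json::Value = serde_json::from_slice(&output.stdout)?;
    // Fail the pipeline if validation did not pass.
    if report["is_valid"].as_bool() != Some(true) {
        eprintln!("validation failed: {}", report["errors"]);
        std::process::exit(1);
    }
    println!("validation passed in {} ms", report["duration_ms"]);
    Ok(())
}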
Audit Mode
Track where unsafe code is used and what trust assumptions are made:
matchy validate --level audit --verbose database.mxy
This mode is useful for security audits and understanding the trust model.
Exit Status
- 0: Validation passed (no errors)
- 1: Validation failed (errors found)
- Other: Command error (file not found, etc.)
Validation Levels
Standard
Fast validation with essential safety checks:
- File format structure
- Offset bounds checking
- UTF-8 string validity
- Basic graph structure
Use when: Validating trusted databases for basic integrity
Strict (Default)
Comprehensive validation including:
- All standard checks
- Cycle detection in automaton
- Redundancy analysis
- Deep consistency checks
- Pattern reachability
Use when: Validating databases from untrusted sources (default)
Audit
All strict checks plus:
- Track all unsafe code locations
- Document trust assumptions
- Report where --trusted mode bypasses validation
- Security analysis
Use when: Performing security audits
Common Validation Errors
Invalid MMDB format
ERROR: Invalid MMDB format: metadata marker not found
The file is not a valid MMDB database.
Offset out of bounds
ERROR: Node 123 edge offset 45678 exceeds file size 40000
The database references data beyond the file size - likely corruption.
Invalid UTF-8
ERROR: String at offset 12345 contains invalid UTF-8
A string in the database is not valid UTF-8 text.
Cycle detected
ERROR: Cycle detected in failure function starting at node 56
The Aho-Corasick automaton has a cycle, making it unsafe to traverse.
Invalid magic bytes
ERROR: PARAGLOB section magic bytes mismatch: expected "PARAGLOB", found "CORRUPT!"
The PARAGLOB section header is corrupted.
When to Validate
Always Validate
- Databases from untrusted sources
- Databases downloaded from the internet
- Databases created by third parties
- After file transfer (detect corruption)
Optional Validation
- Databases built locally with matchy build
- Development/testing environments
Skip Validation
- After validation has already passed
- In performance-critical hot paths
- When loading the same database repeatedly
Performance
Validation speed depends on database size and complexity. Standard mode is typically faster than strict mode.
For very large databases (>100MB), consider using --level standard for faster validation, or validate once and cache the result.
Security Considerations
The validator is designed to be safe even with malicious input:
- No panics: All errors are caught and reported
- Bounds checking: All memory access is validated
- Safe Rust: Core validation uses only safe Rust
- No trust: Assumes file contents may be adversarial
However, validation is not a substitute for other security measures:
- Always validate before first use
- Use strict mode for untrusted sources
- Combine with file integrity checks (checksums)
- Consider sandboxing if processing user-uploaded files
Integration with Other Commands
Validate After Building
matchy build patterns.csv -o database.mxy
matchy validate database.mxy
Validate Before Querying
matchy validate database.mxy && \
matchy query database.mxy "*.example.com"
Batch Validation
for db in *.mxy; do
echo "Validating $db..."
matchy validate --level standard "$db" || echo "FAILED: $db"
done
Troubleshooting
False Positives
Some warnings may be benign:
- Unreferenced patterns (intentional padding)
- Duplicate patterns (for testing)
Use --level standard to skip these checks if needed.
Performance Issues
For very large databases (>100MB):
- Use --level standard for faster validation
- Validate once and cache the result
- Skip validation for trusted internal databases
Memory Usage
Validation loads the entire file into memory. For databases larger than available RAM, validation may fail with an out-of-memory error.
See Also
- matchy build - Build databases
- matchy inspect - Inspect database structure
- Validation API - Programmatic validation
- Schemas Reference - Schema validation details
- Binary Format - Format specification
matchy bench
Benchmark database performance by generating test databases and measuring build, load, and query performance.
Synopsis
matchy bench [OPTIONS] [TYPE]
Description
The matchy bench command generates synthetic test databases of various types and sizes, then benchmarks:
- Build time: How long it takes to create the database
- Load time: How long it takes to open/memory-map the database
- Query performance: Throughput and latency for lookups
This is useful for performance testing, capacity planning, and comparing different database types and configurations.
Arguments
[TYPE]
Type of database to benchmark. Default: ip
Options:
- ip - IP address databases
- literal - Exact string match databases
- pattern - Glob pattern databases
- combined - Mixed database with all entry types
matchy bench ip # Benchmark IP lookups
matchy bench pattern # Benchmark pattern matching
matchy bench combined # Benchmark mixed workload
Options
-n, --count <COUNT>
Number of entries to test with. Default: 1000000
matchy bench ip --count 100000 # Small database
matchy bench ip --count 10000000 # Large database
-o, --output <OUTPUT>
Output file for the test database. If not specified, uses a temporary file.
matchy bench pattern --output test.mxy
-k, --keep
Keep the generated database file after benchmarking (otherwise it’s deleted).
matchy bench ip --output bench.mxy --keep
--load-iterations <LOAD_ITERATIONS>
Number of load iterations to average. Default: 3
matchy bench ip --load-iterations 10
--query-count <QUERY_COUNT>
Number of queries for batch benchmark. Default: 100000
matchy bench ip --query-count 1000000 # 1M queries
--hit-rate <HIT_RATE>
Percentage of queries that should match (0-100). Default: 10
A lower hit rate tests “not found” performance, while a higher hit rate tests match performance.
matchy bench ip --hit-rate 50 # 50% of queries find matches
matchy bench ip --hit-rate 90 # 90% of queries find matches
--pattern-style <PATTERN_STYLE>
Pattern style for pattern benchmarks. Default: complex
Options:
- prefix - Prefix patterns like prefix*
- suffix - Suffix patterns like *.suffix
- mixed - Mix of prefix and suffix
- complex - Complex patterns with wildcards and character classes
matchy bench pattern --pattern-style prefix
matchy bench pattern --pattern-style complex
-h, --help
Print help information.
Examples
Basic IP Benchmark
$ matchy bench ip --count 1000
Pattern Benchmark with Custom Settings
$ matchy bench pattern --count 500 --pattern-style prefix
Combined Benchmark
$ matchy bench combined --count 300
Save Benchmark Database
matchy bench ip --count 1000000 --output benchmark.mxy --keep
This creates a database you can inspect or query later:
matchy inspect benchmark.mxy
matchy query benchmark.mxy "192.0.2.1"
High Hit Rate Benchmark
matchy bench ip --hit-rate 90 --query-count 1000000
Tests performance when most queries find matches (realistic for allowlist/blocklist scenarios).
Low Hit Rate Benchmark
matchy bench ip --hit-rate 5 --query-count 1000000
Tests “not found” performance (realistic for threat intelligence databases where most IPs are not threats).
Benchmark Types
IP Benchmarks
Generates random IPv4 and IPv6 addresses:
- Mix of /32 addresses and CIDR ranges
- Realistic distribution
- Tests binary trie performance
Literal Benchmarks
Generates random strings:
- Domain-like strings (e.g., subdomain.example.com)
- Tests hash table performance
- O(1) lookup complexity
Pattern Benchmarks
Generates glob patterns based on style:
- Prefix: prefix* patterns
- Suffix: *.suffix patterns
- Mixed: Combination of prefix and suffix
- Complex: Wildcards, character classes [abc], negation [!xyz]
Tests Aho-Corasick automaton performance.
Combined Benchmarks
Generates databases with all three types:
- Equal distribution (33.3% each)
- Tests mixed workload performance
- Realistic production scenario
Performance Factors
Benchmark results depend on:
Database Size
- Larger databases → slightly slower queries
- Build time scales linearly
- Load time remains constant (memory-mapped)
Entry Type
- IPs: Fastest (~7M queries/sec)
- Literals: Very fast (~8M queries/sec)
- Patterns: Moderate (~1-2M queries/sec)
Hit Rate
- High hit rate → slightly slower (data extraction overhead)
- Low hit rate → faster (early termination)
Hardware
- CPU speed affects query throughput
- RAM speed affects load performance
- Storage type affects build time
Pattern Complexity
- Simple patterns (prefix/suffix) → faster
- Complex patterns → slower
- More patterns → more states to traverse
Interpreting Results
Build Time
How long it takes to compile entries into optimized format:
- 1M entries: ~1-3 seconds (typical)
- Scales approximately linearly
- One-time cost
Load Time
How long it takes to memory-map the database:
- Should be <1ms for any size
- Instant startup time
- Memory-mapped, not loaded into RAM
Query Performance
Good performance:
- IPs: >5M queries/sec
- Literals: >6M queries/sec
- Patterns: >1M queries/sec
Acceptable performance:
- IPs: 2-5M queries/sec
- Literals: 3-6M queries/sec
- Patterns: 500k-1M queries/sec
Investigate if slower:
- Check system load
- Verify no swap usage
- Check disk I/O (shouldn’t be any after load)
Use Cases
Capacity Planning
# Test with production-sized database
matchy bench combined --count 5000000 --query-count 10000000
Use results to estimate:
- Queries your system can handle
- Memory requirements
- Build time for updates
Performance Regression Testing
# Run before changes
matchy bench pattern --count 1000000 > before.txt
# Make changes...
# Run after changes
matchy bench pattern --count 1000000 > after.txt
# Compare results
diff before.txt after.txt
Hardware Comparison
# Run same benchmark on different systems
matchy bench combined --count 1000000
Compare:
- Query throughput
- Build time
- Load time
Exit Status
- 0: Benchmark completed successfully
- 1: Error (out of memory, disk full, etc.)
See Also
- matchy build - Build production databases
- matchy validate - Validate databases
- Performance Considerations - Optimization guide
- Performance Benchmarks - Detailed performance data
Contributing
Thank you for considering contributing to Matchy!
Ways to Contribute
- Report bugs - File issues with reproduction steps
- Suggest features - Propose new capabilities
- Fix bugs - Submit pull requests
- Add tests - Improve test coverage
- Improve docs - Enhance documentation
- Optimize code - Performance improvements
Getting Started
- Fork the repository on GitHub
- Clone your fork: git clone https://github.com/YOUR_USERNAME/matchy.git, then cd matchy
- Create a branch: git checkout -b feature/my-feature
- Make your changes
- Test thoroughly: cargo test, cargo clippy, cargo fmt
- Commit with clear messages: git commit -m "Add feature: description"
- Push and create a pull request
Development Guidelines
Code Style
- Run cargo fmt before committing
- Fix clippy warnings with cargo clippy
- Use descriptive names for functions and variables
- Add doc comments (///) for public APIs
- Keep functions focused - one responsibility per function
Testing
- Write tests for new features
- Maintain coverage - aim for high test coverage
- Test edge cases - empty inputs, large inputs, invalid data
- Use descriptive test names - test_glob_matches_wildcard
#![allow(unused)]
fn main() {
#[test]
fn test_ip_lookup_finds_exact_match() {
let db = build_test_database();
let result = db.lookup("1.2.3.4").unwrap();
assert!(result.is_some());
}
}
Documentation
- Document public APIs with /// comments
- Include examples in doc comments
- Update mdBook docs for user-facing changes
- Keep README current
#![allow(unused)]
fn main() {
/// Lookup an entry in the database
///
/// # Examples
///
/// ```
/// let db = Database::open("db.mxy")?;
/// let result = db.lookup("1.2.3.4")?;
/// ```
pub fn lookup(&self, query: &str) -> Result<Option<QueryResult>> {
// ...
}
}
Commit Messages
Use clear, descriptive commit messages:
Add: Brief description of what was added
Fix: Brief description of what was fixed
Docs: Brief description of documentation changes
Test: Brief description of test changes
Perf: Brief description of performance improvements
Pull Request Process
- Update tests - Add/update tests for your changes
- Update docs - Update relevant documentation
- Run CI checks locally: cargo test, cargo clippy -- -D warnings, cargo fmt -- --check
- Write clear PR description - Explain what and why
- Link related issues - Reference any related issues
- Be responsive - Address review feedback promptly
Code of Conduct
- Be respectful - Treat everyone with respect
- Be constructive - Provide helpful feedback
- Be patient - Maintainers are often volunteers
- Be collaborative - Work together towards solutions
Questions?
Feel free to:
- Open an issue for questions
- Start a discussion for brainstorming
- Check existing docs for answers
Thank you for contributing! 🎉
Building from Source
Build Matchy from source code.
Prerequisites
- Rust 1.70 or later
- C compiler (for examples)
Quick Build
# Clone
git clone https://github.com/matchylabs/matchy.git
cd matchy
# Build
cargo build --release
# Test
cargo test
# Install CLI
cargo install --path .
Build Profiles
Debug Build
cargo build
# Output: target/debug/
- Fast compilation
- Includes debug symbols
- No optimizations
Release Build
cargo build --release
# Output: target/release/
- Slow compilation
- Full optimizations
- LTO enabled
- Single codegen unit
Build Options
# Check without building
cargo check
# Build with all features
cargo build --all-features
# Build examples
cargo build --examples
# Build documentation
cargo doc --no-deps
C Header Generation
The C header is auto-generated on release builds:
cargo build --release
# Generates: include/matchy.h
Cross-Compilation
# Install target
rustup target add x86_64-unknown-linux-gnu
# Build for target
cargo build --release --target x86_64-unknown-linux-gnu
Development Tools
Matchy includes development-only tools in the examples/ directory that are not installed with cargo install.
Updating the Public Suffix List
The TLD matching feature uses a hash-based lookup table built from the Public Suffix List. To refresh this data:
# Download latest PSL and generate punycode versions
cd tools/update-psl
cargo run
# Verify everything works
cd ../..
cargo test
# Commit the updated data
git add src/data/public_suffix_list.dat
git commit -m "Update Public Suffix List"
The update tool:
- Downloads the latest PSL from publicsuffix.org
- Generates punycode versions of non-ASCII entries (e.g., “公司.cn” → “xn--55qx5d.cn”)
- Saves both UTF-8 and punycode versions to src/data/public_suffix_list.dat
- This ensures domains work whether logs contain UTF-8 or punycode
Note: This is only needed when updating TLD patterns. The PSL data is embedded at compile time, so end users never need to run this.
See Also
Testing
Comprehensive testing guide for Matchy.
Running Tests
# Run all tests
cargo test
# Run with output
cargo test -- --nocapture
# Run specific test
cargo test test_glob_matching
# Run integration tests
cargo test --test integration_tests
# Run with backtrace
RUST_BACKTRACE=1 cargo test
Test Categories
Unit Tests
In module files alongside code:
#![allow(unused)]
fn main() {
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ip_lookup() {
let db = build_test_db();
let result = db.lookup("1.2.3.4").unwrap();
assert!(result.is_some());
}
}
}
Integration Tests
In tests/ directory:
#![allow(unused)]
fn main() {
// tests/integration_tests.rs
use matchy::*;
#[test]
fn test_end_to_end_workflow() {
// Build database
let mut builder = MmdbBuilder::new(MatchMode::CaseSensitive);
builder.add_ip("1.2.3.4", HashMap::new()).unwrap();
let bytes = builder.build().unwrap();
// Save and load
std::fs::write("test.mxy", &bytes).unwrap();
let db = Database::open("test.mxy").unwrap();
// Query
let result = db.lookup("1.2.3.4").unwrap();
assert!(result.is_some());
}
}
Benchmark Tests
cargo bench
Test Patterns
Setup and Teardown
#![allow(unused)]
fn main() {
fn setup() -> Database {
let mut builder = MmdbBuilder::new(MatchMode::CaseSensitive);
builder.add_ip("1.2.3.4", HashMap::new()).unwrap();
let bytes = builder.build().unwrap();
std::fs::write("test.mxy", &bytes).unwrap();
Database::open("test.mxy").unwrap()
}
#[test]
fn test_query() {
let db = setup();
// test...
}
}
Testing Errors
#![allow(unused)]
fn main() {
#[test]
fn test_invalid_ip() {
let db = setup();
let result = db.lookup("invalid");
assert!(result.is_err());
}
}
Coverage
# Install tarpaulin
cargo install cargo-tarpaulin
# Generate coverage
cargo tarpaulin --out Html
See Also
Benchmarking
Performance benchmarking for Matchy.
Running Benchmarks
# Run all benchmarks
cargo bench
# Run specific benchmark
cargo bench pattern_matching
# Save baseline
cargo bench --bench matchy_bench -- --save-baseline main
# Compare to baseline
cargo bench --bench matchy_bench -- --baseline main
Benchmark Categories
- IP lookups - Binary trie performance
- Literal matching - Hash table performance
- Pattern matching - Aho-Corasick performance
- Database building - Construction time
- Database loading - mmap overhead
CLI Benchmarking
# Benchmark IP lookups
matchy bench ip --count 100000
# Benchmark pattern matching
matchy bench pattern --count 50000
# Benchmark combined
matchy bench combined --count 100000
Memory Profiling
Matchy includes tools for analyzing memory allocations during queries.
Query Allocation Profiling
Use the query_profile tool to analyze query-time allocations:
# Run with memory profiling enabled
cargo bench --bench query_profile --features dhat-heap
# Output shows allocation statistics
Completed 1,000,000 queries
=== Query-Only Memory Profile ===
dhat: Total: 8,000,894 bytes in 1,000,014 blocks
dhat: At t-gmax: 753 bytes in 11 blocks
dhat: At t-end: 622 bytes in 10 blocks
dhat: Results saved to: dhat-heap.json
This runs 1 million queries and tracks every allocation.
Interpreting Results
Key metrics:
- Total bytes: All allocations during profiling period
- Total blocks: Number of separate allocations
- t-gmax: Peak heap usage (maximum resident memory)
- t-end: Memory still allocated at program end
What to Look For
Good results (current state):
Total: ~8MB in 1M blocks
- ~1 allocation per query: Only the return Vec is allocated
- ~8 bytes per allocation: Just the Vec header
- Internal buffers are reused across queries
Bad results (if you see this, something regressed):
Total: ~50MB in 5M blocks
- 5+ allocations per query: Temporary buffers not reused
- 50+ bytes per allocation: Excessive copying
- Performance will be degraded
Viewing Detailed Results
The tool generates dhat-heap.json which can be viewed with dhat’s viewer:
# Open in browser (requires dhat repository)
open dhat/dh_view.html
# Then drag and drop dhat-heap.json into the viewer
The viewer shows:
- Allocation call stacks
- Peak memory usage over time
- Hotspots (which code allocates most)
Why This Matters
Query performance is critical. Matchy achieves:
- ~7M queries/second for IP lookups
- ~2M queries/second for pattern matching
This is only possible through careful allocation management:
- Buffer reuse: Internal buffers are reused across queries
- Zero-copy patterns: Data is read directly from mmap’d memory
- Minimal cloning: Only the final result Vec is allocated
Each allocation costs ~100ns, so avoiding them matters.
Allocation Optimization History
Matchy underwent allocation optimization in October 2024:
Before optimization:
- 4 allocations per query (~10.4 bytes each)
- ~40MB allocated per 1M queries
- Short-lived temporary vectors
After optimization:
- 1 allocation per query (~8 bytes)
- ~8MB allocated per 1M queries
- 75% reduction in allocations
Key changes:
- Added result_buffer to reuse across queries
- Changed lookup_into() to write into caller’s buffer
- Preserved buffer capacity across clear() calls
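The pattern generalizes to any hot lookup loop. A generic sketch of buffer reuse (the real lookup_into() signature may differ):
struct Matcher;
impl Matcher {
    // Writes match IDs into `out` instead of returning a fresh Vec.
    fn lookup_into(&self, query: &str, out: &mut Vec<u32>) {
        out.clear(); // keeps capacity, drops old contents
        if query.contains("evil") {
            out.push(42);
        }
    }
}
fn main() {
    let matcher = Matcher;
    let mut results = Vec::with_capacity(16); // allocated once, reused below
    for query in ["a.evil.com", "ok.example", "evil.net"] {
        matcher.lookup_into(query, &mut results);
        println!("{query}: {} match(es)", results.len());
    }
}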
CPU Profiling
Flamegraphs
Visualize where time is spent:
# Install flamegraph
cargo install flamegraph
# Generate flamegraph
sudo cargo flamegraph --bench matchy_bench
# Opens: flamegraph.svg
Flamegraphs show:
- Which functions take the most time (wider = more time)
- Call stack relationships (parent/child)
- Hot paths through your code
Perf on Linux
# Record performance data
perf record --call-graph dwarf cargo bench
# View report
perf report
Instruments on macOS
# Build with debug symbols
cargo build --release
# Profile with Instruments
xcrun xctrace record --template 'Time Profiler' \
--output profile.trace \
--launch target/release/matchy bench database.mxy
# Open in Instruments
open profile.trace
Performance Testing Workflow
When optimizing:
- Establish baseline: cargo bench -- --save-baseline before
- Make changes
- Compare results: cargo bench -- --baseline before
- Profile allocations: cargo bench --bench query_profile --features dhat-heap
- Profile CPU (if needed): sudo cargo flamegraph --bench matchy_bench
- Validate improvements:
  - Check allocation counts didn’t increase
  - Verify throughput improved (or stayed the same)
  - Run full test suite: cargo test
See Also
- Performance Guide - Performance characteristics
- CLI Bench Command - Command-line benchmarking
- Testing - Correctness testing
Fuzzing Guide
Fuzz testing for Matchy.
Setup
# Install cargo-fuzz
cargo install cargo-fuzz
# Initialize fuzzing
cargo fuzz init
Running Fuzzers
# List fuzz targets
cargo fuzz list
# Run specific target
cargo fuzz run fuzz_glob_matching
# Run with jobs
cargo fuzz run fuzz_glob_matching -- -jobs=4
Fuzz Targets
See Fuzz Targets for details.
Corpus Management
# Add to corpus
echo "test input" > fuzz/corpus/fuzz_target/input
# Minimize corpus
cargo fuzz cmin fuzz_target
See Also
CI/CD Checks
Continuous integration checks for Matchy.
Local Checks
Run before committing:
# Run all checks
cargo test
cargo clippy -- -D warnings
cargo fmt -- --check
CI Pipeline
Automated checks on pull requests:
Tests
cargo test --all-features
cargo test --no-default-features
Lints
cargo clippy -- -D warnings
Format
cargo fmt -- --check
Documentation
cargo doc --no-deps
Pre-commit Hook
#!/bin/bash
# .git/hooks/pre-commit
set -e
echo "Running tests..."
cargo test --quiet
echo "Running clippy..."
cargo clippy -- -D warnings
echo "Checking format..."
cargo fmt -- --check
echo "All checks passed!"
See Also
Release Process
This guide covers how to release a new version of Matchy to crates.io using automated GitHub Actions workflows with trusted publishing.
Overview
Matchy uses trusted publishing to securely publish releases to crates.io without managing API tokens. When you push a version tag (like v1.0.0), GitHub Actions automatically:
- Creates a GitHub release
- Builds binaries for multiple platforms
- Publishes to crates.io using OIDC authentication
Prerequisites
One-Time Setup: Configure Trusted Publishing
Before your first release, you must configure trusted publishing on crates.io:
- Go to https://crates.io/crates/matchy/settings
- Navigate to the “Trusted Publishing” section
- Click “Add” and fill in:
- Repository owner:
matchylabs - Repository name:
matchy - Workflow filename:
release.yml - Environment:
release
- Repository owner:
- Click “Save”
This tells crates.io to trust releases from your GitHub Actions workflow.
Note: The GitHub release environment has already been created in your repository.
Release Checklist
Before releasing, ensure:
- All tests pass: cargo test
- Benchmarks run successfully: cargo bench
- Documentation builds: cargo doc --no-deps
- CHANGELOG.md is updated with version changes
- README.md reflects current features
- No uncommitted changes
Creating a Release
1. Update the Version
Update the version in Cargo.toml:
[package]
name = "matchy"
version = "1.0.0" # Update this
2. Commit the Version Bump
git add Cargo.toml CHANGELOG.md
git commit -m "Release version 1.0.0"
git push origin main
3. Create and Push the Tag
# Create an annotated tag
git tag -a v1.0.0 -m "Release version 1.0.0"
# Push the tag (this triggers the release workflow)
git push origin v1.0.0
Important: The tag version must match the Cargo.toml version. The workflow will fail if they don’t match (e.g., tag v1.0.0 requires version = "1.0.0" in Cargo.toml).
What Happens Automatically
When you push the tag, the GitHub Actions workflow (.github/workflows/release.yml) runs three jobs:
Job 1: Create Release
- Creates a GitHub release for the tag
- Sets the release name and description
Job 2: Build CLI Binaries
Builds the matchy CLI for multiple platforms:
- Linux x86_64 (.tar.gz)
- Linux ARM64 (.tar.gz) - cross-compiled
- macOS x86_64 (.tar.gz)
- macOS ARM64 (.tar.gz)
- Windows x86_64 (.zip)
All archives are attached to the GitHub release for users who want pre-built binaries.
Job 3: Publish to crates.io
- Verifies the tag version matches Cargo.toml
- Uses the rust-lang/crates-io-auth-action to authenticate via OIDC
- Runs cargo publish with a short-lived token
- No API tokens are stored in the repository!
Monitoring a Release
Watch the Workflow
Monitor the release progress:
# Watch the workflow run from the terminal
gh run watch
Or visit: https://github.com/matchylabs/matchy/actions
Verify Publication
After the workflow completes:
- Check crates.io: https://crates.io/crates/matchy
- Check GitHub release: https://github.com/matchylabs/matchy/releases
- Test installation:
cargo install matchy --force
matchy --version
Troubleshooting
“Trusted publishing not configured”
Problem: The workflow fails with an authentication error.
Solution: Follow the Prerequisites section to configure trusted publishing on crates.io.
“Version mismatch”
Problem: The workflow fails with “Tag version does not match Cargo.toml version.”
Solution: Ensure the tag (e.g., v1.0.0) matches the version in Cargo.toml (e.g., version = "1.0.0"). Delete the tag, fix the version, and re-tag:
# Delete local and remote tag
git tag -d v1.0.0
git push origin :refs/tags/v1.0.0
# Fix Cargo.toml, commit, then re-tag
git tag -a v1.0.0 -m "Release version 1.0.0"
git push origin v1.0.0
“Permission denied” or OIDC errors
Problem: The workflow can’t authenticate with crates.io.
Solution: Verify that:
- The release environment exists in your repository
- The workflow has id-token: write permission (already set)
- Trusted publishing is configured on crates.io with the correct repository and workflow name
Build failures
Problem: The build or tests fail during the workflow.
Solution: Test locally first:
# Run all checks locally
cargo test
cargo clippy -- -D warnings
cargo build --release
# Test cross-compilation (if needed)
cargo build --release --target x86_64-unknown-linux-gnu
Semantic Versioning
Matchy follows Semantic Versioning:
- MAJOR (1.0.0 → 2.0.0): Breaking API changes
- MINOR (1.0.0 → 1.1.0): New features, backwards compatible
- PATCH (1.0.0 → 1.0.1): Bug fixes, backwards compatible
When to Bump
- Major: Binary format changes, API removals, behavior changes
- Minor: New features, new APIs, performance improvements
- Patch: Bug fixes, documentation updates, internal refactoring
Pre-Releases
For testing before an official release:
# Use a pre-release version
version = "1.0.0-beta.1"
# Tag with the same format
git tag -a v1.0.0-beta.1 -m "Beta release"
git push origin v1.0.0-beta.1
Pre-release versions are published to crates.io but not marked as the “latest” version.
Yanking a Release
If you discover a critical issue after publishing:
# Yank the problematic version
cargo yank --vers 1.0.0
# Fix the issue, then release a new version
# Bump to 1.0.1 and follow the normal release process
Yanked versions remain available for existing users but won’t be installed for new users.
How Trusted Publishing Works
Under the hood:
- GitHub Actions generates an OIDC token that cryptographically proves:
  - The workflow is running from the matchylabs/matchy repository
  - It’s using the release.yml workflow
  - It’s deploying to the release environment
- The rust-lang/crates-io-auth-action exchanges this OIDC token for a short-lived crates.io token (expires in 30 minutes)
- cargo publish uses this temporary token to upload the crate
- The token expires automatically - no cleanup needed!
This is more secure than API tokens because:
- No long-lived secrets to manage or rotate
- Tokens are scoped to specific repositories and workflows
- Cryptographic proof of workflow identity
- Automatic expiration prevents token reuse
See Also
- Testing - Run tests before releasing
- CI Checks - What CI validates
- Benchmarking - Performance validation
- GitHub Actions Workflows
- crates.io Trusted Publishing Docs
Frequently Asked Questions
General
What is Matchy?
Matchy is a database for IP address and string matching. It supports matching IP addresses, CIDR ranges, exact strings, and glob patterns with associated structured data.
How is Matchy different from MaxMind’s GeoIP?
Matchy can read standard MaxMind MMDB files and extends the format to support string matching and glob patterns. If you only need IP lookups, MaxMind’s libraries work great. If you also need string and pattern matching, Matchy provides that functionality.
Is Matchy production-ready?
Matchy is actively developed and used in production systems. The API is stable, and the binary format is versioned. Always test thoroughly in your specific environment.
Performance
How fast is Matchy?
Typical performance on modern hardware:
- 7M+ IP address lookups per second
- 1M+ pattern matches per second (with 50,000 patterns)
- Sub-microsecond latency for individual queries
- Sub-millisecond loading time via memory mapping
Actual performance depends on your hardware, database size, and query patterns.
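To measure numbers on your own hardware and data, the matchy bench command is available, or a rough timing loop along these lines can be used (the database path and query below are placeholders):
use matchy::Database;
use std::time::Instant;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder path and query - substitute your own database and inputs.
    let db = Database::open("threats.mxy")?;

    let iterations = 1_000_000u32;
    let start = Instant::now();
    for _ in 0..iterations {
        let _ = db.lookup("203.0.113.7")?;
    }

    let per_sec = f64::from(iterations) / start.elapsed().as_secs_f64();
    println!("~{per_sec:.0} lookups/sec");
    Ok(())
}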
Does Matchy work with multiple processes?
Yes. Matchy uses memory mapping, so the operating system automatically shares database pages across processes. 64 processes querying the same 100MB database will use approximately 100MB of RAM total, not 6,400MB.
What’s the maximum database size?
Matchy can handle databases larger than available RAM thanks to memory mapping. The practical limit depends on your system’s virtual address space (effectively unlimited on 64-bit systems).
Compatibility
Can I use Matchy with languages other than Rust?
Yes. Matchy provides a C API that can be called from any language with C FFI support. This includes C++, Python, Go, Node.js, and many others.
Does Matchy run on Windows?
Yes. Matchy supports Linux, macOS, and Windows (10+).
Database Format
What file format does Matchy use?
Matchy uses a compact binary format based on MaxMind’s MMDB specification. The format supports:
- IP address trees (compatible with MMDB)
- Hash tables for exact string matches (extension)
- Aho-Corasick automaton for patterns (extension)
- Structured data storage (compatible with MMDB)
Can I read Matchy databases from other tools?
Standard MaxMind MMDB readers can read the IP address portion of a Matchy database. The string and pattern matching features require using Matchy’s libraries.
Are databases portable across platforms?
Yes. Matchy databases are platform-independent binary files. A database built on Linux works on macOS and Windows without modification.
Entry Types
How do I match a string that contains wildcards literally?
Use the literal: prefix to force exact matching:
literal:file*.txt
This will match the literal string “file*.txt” instead of treating * as a wildcard.
How do I force a string to be treated as a pattern?
Use the glob: prefix:
glob:example.com
This forces “example.com” to be treated as a glob pattern instead of an exact string.
What are type prefixes and when should I use them?
Type prefixes (literal:, glob:, ip:) override Matchy’s automatic entry type detection.
Use them when:
- A string contains *, ?, or [ that should be matched literally
- You need consistent behavior across mixed data sources
- Auto-detection doesn’t match your intent
See Entry Types - Prefix Technique for details.
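As an illustration, here is a builder sketch that forces both behaviors; the keys and data values are made up for the example:
use matchy::{DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);

    // Stored as the exact string "file*.txt"; '*' is not a wildcard here.
    let mut exact = HashMap::new();
    exact.insert("note".to_string(), DataValue::String("literal".to_string()));
    builder.add_entry("literal:file*.txt", exact)?;

    // Stored as a glob pattern even though it contains no wildcard characters.
    let mut pattern = HashMap::new();
    pattern.insert("note".to_string(), DataValue::String("pattern".to_string()));
    builder.add_entry("glob:example.com", pattern)?;

    let bytes = builder.build()?;
    std::fs::write("prefixed.mxy", &bytes)?;
    Ok(())
}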
Changelog
All notable changes to matchy are documented here.
The format is based on Keep a Changelog, and this project adheres to Semantic Versioning.
For detailed version history, see the full CHANGELOG.md in the repository.
[1.2.1] - 2025-10-28
Fixed
- Critical: Worker False Positive Bug
  - Fixed bug where Worker was treating QueryResult::NotFound as a valid match
  - Affects batch processing and matchy match command accuracy
  - Now correctly distinguishes between matches and non-matches
[1.2.0] - 2025-10-28
Added
- String Interning for Database Size Reduction
- Automatic deduplication of repeated string values in database data sections
- Significantly reduces database size for datasets with redundant metadata
- Zero query-time overhead - interning happens at build time
- Transparent to API users - no code changes required
Fixed
- Critical: Database Construction Bugs (discovered via fuzzing)
- Fixed UTF-8 boundary bug in case-insensitive glob pattern matching that could create malformed databases
- Added overflow/underflow validation in IP tree builder to prevent invalid pointer arithmetic
- Database builder now validates all record values before writing to prevent creating unreadable databases
- Enhanced input validation during database construction
- Improved error messages for invalid data pointer calculations
Changed
- Database loader now provides detailed error messages on invalid pointer arithmetic instead of panicking
- Improved error messages for invalid input during database building
- Better detection and reporting of malformed patterns and IP addresses
[1.1.0] - 2025-10-25
Added
- matchy extract command for high-performance pattern extraction from logs
  - Extract domains, IPv4/IPv6 addresses, and email addresses from unstructured text
  - Multiple output formats: JSON (NDJSON), CSV, plain text
  - Configurable extraction types with --types flag (ipv4, ipv6, domain, email, all)
  - Deduplication mode with --unique flag
  - Statistics reporting with --stats flag
  - 200-500 MB/s typical throughput
- Parallel Multi-File Processing for matchy match
  - -j/--threads flag for parallel processing (default: auto-detect cores)
  - 2-8x faster throughput on multi-core systems
  - Per-worker LRU caches for optimal performance
  - --batch-bytes tuning option for large files
- Follow Mode for matchy match
  - -f/--follow flag for log tailing (like tail -f)
  - Monitors files for changes using file system notifications
  - Processes new lines immediately as they are written
  - Supports parallel processing with multiple files
- Live Progress Reporting
  - -p/--progress flag shows live 3-line progress indicator
  - Displays lines processed, matches, hit rate, throughput, elapsed time
  - Candidate breakdown (IPv4, IPv6, domains, emails)
  - Query rate statistics
- Query Result Caching for high-throughput workloads
  - Configurable LRU cache with Database::from().cache_capacity(size) builder API
  - Disable caching with Database::from().no_cache() for memory-constrained environments
  - clear_cache() method for cache management
  - Benchmarks show 2-10x speedup at 80%+ cache hit rates
- Pattern Extractor API for log scanning and data extraction
  - SIMD-accelerated extraction of domains, IPv4/IPv6 addresses, and email addresses
  - Zero-copy line scanning with memchr for maximum throughput
  - Unicode/IDN domain support with automatic punycode conversion
  - Binary log support (extracts ASCII patterns from non-UTF-8 data)
Performance
- AC Automaton Optimizations: 2.4% speedup from memory-locked automaton
- Parallel Processing: 2-8x speedup on multi-core systems
- Caching: 2-10x query speedup with 80%+ hit rates
[1.0.1] - 2025-10-14
Fixed
- Critical: IP Longest Prefix Match Bug (#10)
- Fixed insertion order dependency affecting IP address lookups
- More specific prefixes (e.g., /32) now correctly take precedence over less specific ones (e.g., /24)
- Affects both IPv4 and IPv6 lookups
- Internal fix only - no database format changes
Added
- Comprehensive test suite for longest prefix matching
- IPv6 longest prefix match tests
[1.0.0] - 2025-10-13
🎉 First Stable Release
Matchy 1.0.0 is production-ready! This major release includes database format updates and comprehensive validation infrastructure.
🚨 Breaking Changes
- Database Format: Updated binary format (databases from v0.5.x must be rebuilt)
- Match Mode Storage: Case sensitivity now stored in database metadata
Highlights
Validation System
- Three validation levels: Standard, Strict, and Audit
- Complete database integrity checking before loading
- CLI commands: matchy validate and matchy audit
- C API: matchy_validate() function
- Prevents crashes from corrupted or malicious databases
Case-Insensitive Matching
- Build-time -i/--case-insensitive flag
- Match mode persisted in database metadata
- Zero query-time overhead
- Automatic deduplication of case variants
Performance
- Validation: ~18-20ms on 193MB database (minimal impact)
- All 0.5.x performance characteristics maintained:
- 7M+ IP queries/second
- 1M+ pattern queries/second
- <100μs database loading
- 30-57% faster than 0.4.x pattern matching
Testing
- 163 tests passing (all unit, integration, and doc tests)
- 5 active fuzz targets
- Comprehensive validation coverage
[0.5.2] - 2025-10-12
Major Performance Improvements
- 30-57% faster pattern matching via state-specific AC encoding
- O(1) database loading with lazy offset-based lookups
- Trusted mode for 15-20% additional speedup (skips validation)
Critical Bug Fixes
- Fixed UTF-8 boundary panic in glob matching (found by fuzzing)
- Fixed exponential backtracking / OOM vulnerability (found by fuzzing)
Added
- Comprehensive matchy bench command (900+ lines)
- Fuzzing infrastructure with 5 fuzz targets
- Zero-copy optimizations with zerocopy 0.8
- Database::open_trusted() API
[0.5.1] - 2025-10-11
Added
- cargo-c configuration for C/C++ library installation
- System-wide installation support: cargo cinstall
- Headers install to /usr/local/include/matchy/
[0.5.0] - 2025-01-15
Major Performance Improvements
- 18x faster build times (424K patterns in ~1 second)
- 15x smaller databases (~72 MB vs 1.1 GB)
- 10-100x faster literal queries via O(1) hash lookup
Added
- Hybrid lookup architecture (hash table + Aho-Corasick + IP trie)
- Literal hash table for exact string matching
- CSV input format support
- MISP streaming import
- Enhanced CLI with JSON output and exit codes
[0.4.0] - 2025-01-10
Major Changes
- Project renamed from paraglob-rs to matchy
- Full MMDB integration for IP address lookups
- Unified database format (IP addresses + patterns)
- v3 format with zero-copy AC literal mapping
Added
- IP address and CIDR range matching (IPv4 and IPv6)
- MISP threat feed integration
- CLI tool: matchy query, matchy inspect, matchy build
- Rich structured data storage (MMDB-compatible encoding)
Performance
- 1.4M queries/sec with 10K patterns
- 1.5M IP lookups/sec
- <150μs database load time
Release Process
Releases follow Semantic Versioning:
- MAJOR (1.x): Incompatible API or format changes
- MINOR (x.1): New backward-compatible functionality
- PATCH (x.x.1): Backward-compatible bug fixes
Glossary
Database
A database is a binary file containing entries for IP addresses, CIDR ranges, exact
strings, and glob patterns, along with associated data. Databases are created with a
database builder and queried with the Database::lookup() method.
Database Builder
A database builder (DatabaseBuilder) is used to construct a new database.
You add entries to the builder, then call .build() to produce the final
database bytes.
Entry
An entry is a single item added to a database. An entry consists of a key (IP address, CIDR range, exact string, or glob pattern) and associated data. Matchy automatically detects the entry type based on the key format.
CIDR
CIDR (Classless Inter-Domain Routing) is a notation for specifying IP address ranges,
such as 192.0.2.0/24. The number after the slash indicates how many bits of the address
are fixed. Matchy supports both IPv4 and IPv6 CIDR ranges.
Pattern
A pattern is a string containing wildcard characters (* or ?) that can match multiple
input strings. For example, *.example.com matches foo.example.com, bar.example.com,
and any other subdomain of example.com.
Query
A query is a lookup operation on a database. You pass a string to
Database::lookup(), and Matchy returns matching data if found. The query automatically
checks IP addresses, CIDR ranges, exact strings, and patterns.
Match Mode
Match mode determines how string comparisons are performed. MatchMode::CaseSensitive
treats "ABC" and "abc" as different. MatchMode::CaseInsensitive treats them as the same.
Match mode is set when creating a database builder.
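For example (a minimal sketch; only the builder constructor is shown):
use matchy::{DatabaseBuilder, MatchMode};

fn main() {
    // "ABC" and "abc" are stored and matched as distinct keys here...
    let _case_sensitive = DatabaseBuilder::new(MatchMode::CaseSensitive);
    // ...and treated as the same key here.
    let _case_insensitive = DatabaseBuilder::new(MatchMode::CaseInsensitive);
}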
Memory Mapping
Memory mapping (mmap) is a technique that maps file contents directly into a process’s address space. Matchy uses memory mapping to load databases instantly without deserialization. The operating system shares memory-mapped pages across processes, reducing memory usage.
MMDB
MMDB (MaxMind Database) is a binary format for storing IP geolocation data, created by MaxMind. Matchy can read standard MMDB files and extends the format to support string matching and glob patterns.
Data Value
A data value is a piece of structured data associated with an entry. Matchy supports several data types including strings, integers, floats, booleans, arrays, and maps. Data values are stored in a compact binary format within the database.
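For example, a sketch attaching a small data map to an entry; only the data types that appear elsewhere in this book are used here, so see the Rust API reference for the complete set:
use matchy::{DatabaseBuilder, MatchMode, DataValue};
use std::collections::HashMap;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);

    // Structured data stored alongside the entry.
    let mut data = HashMap::new();
    data.insert("category".to_string(), DataValue::String("botnet".to_string()));
    data.insert("confidence".to_string(), DataValue::Uint32(80));
    builder.add_entry("198.51.100.0/24", data)?;

    let _bytes = builder.build()?;
    Ok(())
}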
Examples
This appendix contains complete examples demonstrating Matchy usage.
Threat Intelligence Database
Build a database of malicious IPs and domains:
use matchy::{Database, DatabaseBuilder, MatchMode, DataValue, QueryResult};
use std::collections::HashMap;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);

    // Add known malicious IP
    let mut threat = HashMap::new();
    threat.insert("severity".to_string(), DataValue::String("critical".to_string()));
    threat.insert("type".to_string(), DataValue::String("c2_server".to_string()));
    builder.add_entry("198.51.100.1", threat)?;

    // Add botnet CIDR range
    let mut botnet = HashMap::new();
    botnet.insert("severity".to_string(), DataValue::String("high".to_string()));
    botnet.insert("type".to_string(), DataValue::String("botnet".to_string()));
    builder.add_entry("203.0.113.0/24", botnet)?;

    // Add phishing domain pattern
    let mut phishing = HashMap::new();
    phishing.insert("category".to_string(), DataValue::String("phishing".to_string()));
    builder.add_entry("*.phishing-site.com", phishing)?;

    // Build and save
    let db_bytes = builder.build()?;
    std::fs::write("threats.mxy", &db_bytes)?;

    // Query
    let db = Database::open("threats.mxy")?;
    if let Some(QueryResult::Ip { data, .. }) = db.lookup("198.51.100.1")? {
        println!("Threat found: {:?}", data);
    }
    if let Some(QueryResult::Pattern { data, .. }) = db.lookup("login.phishing-site.com")? {
        println!("Phishing site: {:?}", data[0]);
    }

    Ok(())
}
GeoIP Database
Query a MaxMind GeoIP database:
use matchy::{Database, QueryResult};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Open a standard MaxMind GeoLite2 database
    let db = Database::open("GeoLite2-City.mmdb")?;

    // Look up IP address
    match db.lookup("8.8.8.8")? {
        Some(QueryResult::Ip { data, prefix_len }) => {
            println!("IP: 8.8.8.8/{}", prefix_len);
            println!("Data: {:#?}", data);
        }
        _ => println!("Not found"),
    }

    Ok(())
}
Multi-Pattern Matching
Match against thousands of patterns efficiently:
use matchy::{DatabaseBuilder, Database, MatchMode, DataValue};
use std::collections::HashMap;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut builder = DatabaseBuilder::new(MatchMode::CaseInsensitive);

    // Add thousands of malicious domain patterns
    for i in 0..50_000 {
        let mut data = HashMap::new();
        data.insert("id".to_string(), DataValue::Uint32(i));
        builder.add_entry(&format!("*.malware{}.com", i), data)?;
    }

    let db_bytes = builder.build()?;
    std::fs::write("patterns.mxy", &db_bytes)?;

    let db = Database::open("patterns.mxy")?;

    // Query against 50,000 patterns - still fast!
    let start = std::time::Instant::now();
    let result = db.lookup("subdomain.malware42.com")?;
    println!("Query time: {:?}", start.elapsed());
    println!("Result: {:?}", result);

    Ok(())
}
See the repository examples directory for more complete examples.