KnowledgeRefinery/daemon-go/internal/storage/vectorstore.go
oho 38a99476d6 Knowledge Refinery: local-first semantic search & 3D concept visualization
macOS app for corpus ingestion, semantic search, and concept universe
visualization powered by local LLMs via LM Studio.

Architecture:
- Go daemon (17MB single binary, zero dependencies)
  - chi router, pure-Go SQLite, tiktoken tokenizer
  - 6-stage pipeline: scan → extract → chunk → embed → annotate → conceptualize
  - Brute-force cosine vector search in memory
  - 89 tests across 8 packages
- SwiftUI app (macOS 15+)
  - Multi-workspace management with auto-start daemons
  - Live pipeline progress, search, concept browser
  - WebGPU 3D universe renderer with Canvas2D fallback
  - Custom crystal app icon
2026-02-13 18:09:46 +01:00

270 lines
6.4 KiB
Go

package storage
import (
"database/sql"
"encoding/binary"
"fmt"
"math"
"sort"
"sync"
)
// VectorRecord holds a chunk's embedding and the metadata persisted with it.
// Every field is written to the same-named column of the chunk_vectors table;
// all fields except Vector are serialized under the JSON key given in the tag.
type VectorRecord struct {
// ID identifies the chunk and is stored in the table's primary-key column.
ID string `json:"id"`
// Vector is the embedding as stored (not normalized); it is left out of JSON
// output and persisted as a blob of little-endian float32 values.
Vector []float32 `json:"-"`
// Text is the chunk text kept alongside the embedding.
Text string `json:"text"`
// AssetID names the asset the chunk belongs to; DeleteByAsset matches on it.
AssetID string `json:"asset_id"`
// AssetPath is persisted in the asset_path column.
AssetPath string `json:"asset_path"`
// EvidenceAnchor is persisted in the evidence_anchor column.
EvidenceAnchor string `json:"evidence_anchor"`
// Topics is persisted in the topics column as a single string.
Topics string `json:"topics"`
// AtomType is persisted in the atom_type column.
AtomType string `json:"atom_type"`
// PipelineVersion is persisted in the pipeline_version column.
PipelineVersion string `json:"pipeline_version"`
}
// SearchResult is a VectorRecord paired with its distance from the query.
// The embedded record's fields are promoted, so they appear at the top level
// of the JSON encoding next to the distance.
type SearchResult struct {
VectorRecord
// Distance is 1 minus the cosine similarity computed by Search, so lower
// values mean closer matches. It is serialized under the key "_distance".
Distance float64 `json:"_distance"`
}
// VectorStore stores and searches embeddings using SQLite + in-memory index.
// Rows are persisted in the chunk_vectors table, while searches run against
// the in-memory cache of normalized vectors. Because the struct contains a
// mutex it must be used through a pointer and never copied.
type VectorStore struct {
// db is the database handle used for all reads and writes of chunk_vectors.
db *sql.DB
// dimension is the embedding length; LoadAll overwrites it with the length
// of the first loaded vector when the table is not empty.
dimension int
// mu guards dimension, cache and loaded.
mu sync.RWMutex
cache []cachedVec // in-memory normalized vectors for fast search
// loaded is set to true once LoadAll has installed a cache.
loaded bool
}
// cachedVec is one entry of the in-memory search index: a record together
// with a normalized copy of its embedding.
type cachedVec struct {
// id duplicates rec.ID.
id string
vector []float32 // pre-normalized
// rec is the full record, including the original (un-normalized) vector.
rec VectorRecord
}
// vectorTableDDL is the schema executed by NewVectorStore. It creates the
// chunk_vectors table (keyed by id) and an index on asset_id. Both statements
// use IF NOT EXISTS, so running them against an existing database is harmless.
// Note that evidence_anchor, topics, atom_type and pipeline_version are
// declared without NOT NULL and may therefore hold NULL.
const vectorTableDDL = `
CREATE TABLE IF NOT EXISTS chunk_vectors (
id TEXT PRIMARY KEY,
vector BLOB NOT NULL,
text TEXT NOT NULL,
asset_id TEXT NOT NULL,
asset_path TEXT NOT NULL,
evidence_anchor TEXT,
topics TEXT,
atom_type TEXT,
pipeline_version TEXT
);
CREATE INDEX IF NOT EXISTS idx_chunk_vectors_asset ON chunk_vectors(asset_id);
`
// NewVectorStore makes sure the chunk_vectors table and its asset index exist
// in db, then returns a store configured with the given embedding dimension.
//
// The in-memory cache starts out empty; call LoadAll to fill it from rows that
// are already persisted. An error is returned when the schema statement fails.
func NewVectorStore(db *sql.DB, dimension int) (*VectorStore, error) {
	_, err := db.Exec(vectorTableDDL)
	if err != nil {
		return nil, fmt.Errorf("create vector table: %w", err)
	}

	store := new(VectorStore)
	store.db = db
	store.dimension = dimension
	return store, nil
}
// SetDimension overrides the embedding dimension recorded on the store.
// The write happens while holding the store's exclusive lock.
func (vs *VectorStore) SetDimension(dim int) {
	vs.mu.Lock()
	defer vs.mu.Unlock()

	vs.dimension = dim
}
// Dimension reports the embedding dimension currently recorded on the store,
// reading it while holding the shared lock.
func (vs *VectorStore) Dimension() int {
	vs.mu.RLock()
	dim := vs.dimension
	vs.mu.RUnlock()

	return dim
}
// LoadAll reads every row of chunk_vectors into memory and installs the result
// as the search cache, replacing whatever the cache held before.
//
// Each stored vector is decoded from its blob, kept un-normalized on the
// record, and cached in normalized form for Search. When at least one row is
// loaded, the store's dimension is set to the length of the first vector.
//
// The cache is swapped only after the whole result set has been read
// successfully: on a query, scan, or iteration error the previous cache is
// left untouched and the (wrapped) error is returned.
func (vs *VectorStore) LoadAll() error {
	rows, err := vs.db.Query("SELECT id, vector, text, asset_id, asset_path, evidence_anchor, topics, atom_type, pipeline_version FROM chunk_vectors")
	if err != nil {
		return fmt.Errorf("query chunk vectors: %w", err)
	}
	defer rows.Close()

	var cache []cachedVec
	for rows.Next() {
		var (
			rec     VectorRecord
			vecBlob []byte
			// These four columns are declared without NOT NULL. Scanning a
			// NULL directly into a string fails, so read them through
			// NullString and treat NULL as the empty string.
			anchor, topics, atomType, version sql.NullString
		)
		if err := rows.Scan(&rec.ID, &vecBlob, &rec.Text, &rec.AssetID, &rec.AssetPath,
			&anchor, &topics, &atomType, &version); err != nil {
			return fmt.Errorf("scan chunk vector row: %w", err)
		}
		rec.EvidenceAnchor = anchor.String
		rec.Topics = topics.String
		rec.AtomType = atomType.String
		rec.PipelineVersion = version.String

		rec.Vector = blobToFloat32(vecBlob)
		cache = append(cache, cachedVec{id: rec.ID, vector: normalize(rec.Vector), rec: rec})
	}
	// Check for an iteration error before publishing anything, so a result
	// set that was cut short never becomes the live cache.
	if err := rows.Err(); err != nil {
		return fmt.Errorf("iterate chunk vectors: %w", err)
	}

	vs.mu.Lock()
	defer vs.mu.Unlock()
	vs.cache = cache
	vs.loaded = true
	if len(cache) > 0 {
		vs.dimension = len(cache[0].vector)
	}
	return nil
}
// AddVectors persists records to chunk_vectors in a single transaction and,
// once the transaction has committed, mirrors them into the in-memory cache.
//
// Rows are written with INSERT OR REPLACE, so a record whose ID already exists
// overwrites the stored row. The cache follows the same rule: an existing
// entry with that ID is replaced in place rather than duplicated, and when the
// same ID occurs more than once in records the last occurrence wins.
//
// If any statement or the commit fails, the transaction is rolled back, the
// cache is left exactly as it was, and the (wrapped) error is returned.
func (vs *VectorStore) AddVectors(records []VectorRecord) error {
	tx, err := vs.db.Begin()
	if err != nil {
		return fmt.Errorf("begin vector insert: %w", err)
	}
	// After a successful Commit this Rollback is a no-op that reports
	// sql.ErrTxDone, so its result is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare(`
INSERT OR REPLACE INTO chunk_vectors
(id, vector, text, asset_id, asset_path, evidence_anchor, topics, atom_type, pipeline_version)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`)
	if err != nil {
		return fmt.Errorf("prepare vector insert: %w", err)
	}
	defer stmt.Close()

	// Stage cache entries locally; they are published only after Commit so the
	// cache can never contain rows the database rejected.
	staged := make([]cachedVec, 0, len(records))
	slot := make(map[string]int, len(records)) // record ID -> index in staged
	for _, rec := range records {
		if _, err := stmt.Exec(rec.ID, float32ToBlob(rec.Vector), rec.Text, rec.AssetID, rec.AssetPath,
			rec.EvidenceAnchor, rec.Topics, rec.AtomType, rec.PipelineVersion); err != nil {
			return fmt.Errorf("insert vector %q: %w", rec.ID, err)
		}
		entry := cachedVec{id: rec.ID, vector: normalize(rec.Vector), rec: rec}
		if i, seen := slot[rec.ID]; seen {
			staged[i] = entry
			continue
		}
		slot[rec.ID] = len(staged)
		staged = append(staged, entry)
	}

	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit vector insert: %w", err)
	}

	vs.mu.Lock()
	defer vs.mu.Unlock()

	// Overwrite entries whose ID was replaced in the table, then append the
	// ones that are new to the cache.
	placed := make([]bool, len(staged))
	for i := range vs.cache {
		if j, ok := slot[vs.cache[i].id]; ok {
			vs.cache[i] = staged[j]
			placed[j] = true
		}
	}
	for j := range staged {
		if !placed[j] {
			vs.cache = append(vs.cache, staged[j])
		}
	}
	return nil
}
// Search returns up to limit cached records closest to queryVec, ordered by
// ascending cosine distance (1 - cosine similarity, so lower is more similar).
//
// The query is normalized before scoring and compared against the
// pre-normalized cached vectors by brute force. Cached vectors whose length
// differs from the query's are skipped: their score would be meaningless, and
// a shorter cached vector would index out of range.
//
// A non-positive limit yields an empty, non-nil slice.
func (vs *VectorStore) Search(queryVec []float32, limit int) []SearchResult {
	if limit <= 0 {
		// Guard against a negative length reaching make, which would panic.
		return []SearchResult{}
	}
	query := normalize(queryVec)

	vs.mu.RLock()
	defer vs.mu.RUnlock()

	type scored struct {
		idx  int
		dist float64
	}
	scores := make([]scored, 0, len(vs.cache))
	for i := range vs.cache {
		candidate := vs.cache[i].vector
		if len(candidate) != len(query) {
			continue
		}
		sim := dotProduct(query, candidate)
		// Convert cosine similarity to distance (lower = more similar).
		scores = append(scores, scored{idx: i, dist: 1.0 - float64(sim)})
	}
	sort.Slice(scores, func(i, j int) bool {
		return scores[i].dist < scores[j].dist
	})

	n := limit
	if n > len(scores) {
		n = len(scores)
	}
	results := make([]SearchResult, n)
	for i := range results {
		results[i] = SearchResult{
			VectorRecord: vs.cache[scores[i].idx].rec,
			Distance:     scores[i].dist,
		}
	}
	return results
}
// DeleteByAsset removes every vector belonging to assetID, first from the
// chunk_vectors table and then from the in-memory cache.
//
// If the database delete fails, the cache is left untouched and the (wrapped)
// error is returned.
func (vs *VectorStore) DeleteByAsset(assetID string) error {
	if _, err := vs.db.Exec("DELETE FROM chunk_vectors WHERE asset_id=?", assetID); err != nil {
		return fmt.Errorf("delete vectors for asset %q: %w", assetID, err)
	}

	vs.mu.Lock()
	defer vs.mu.Unlock()

	// Filter in place, reusing the cache's backing array.
	kept := vs.cache[:0]
	for _, cv := range vs.cache {
		if cv.rec.AssetID != assetID {
			kept = append(kept, cv)
		}
	}
	// Zero the slots past the new length; otherwise the backing array keeps
	// the removed entries' vectors and text reachable.
	for i := len(kept); i < len(vs.cache); i++ {
		vs.cache[i] = cachedVec{}
	}
	vs.cache = kept
	return nil
}
// Count reports how many vectors the in-memory cache currently holds.
func (vs *VectorStore) Count() int {
	vs.mu.RLock()
	total := len(vs.cache)
	vs.mu.RUnlock()

	return total
}
// GetAllVectors returns three parallel slices describing every cached entry:
// the ids, the original (un-normalized) vectors, and the chunk texts. Index i
// of each slice refers to the same entry. The returned vector slices are the
// ones held by the cache, not copies.
func (vs *VectorStore) GetAllVectors() (ids []string, vectors [][]float32, texts []string) {
	vs.mu.RLock()
	defer vs.mu.RUnlock()

	total := len(vs.cache)
	ids, vectors, texts = make([]string, total), make([][]float32, total), make([]string, total)
	for i := range vs.cache {
		entry := &vs.cache[i]
		ids[i] = entry.id
		vectors[i] = entry.rec.Vector
		texts[i] = entry.rec.Text
	}
	return ids, vectors, texts
}
// -- Vector math helpers --
// float32ToBlob serializes v as a byte slice of consecutive little-endian
// IEEE-754 bit patterns, four bytes per element. An empty or nil input yields
// an empty, non-nil slice.
func float32ToBlob(v []float32) []byte {
	const width = 4 // bytes per float32

	out := make([]byte, width*len(v))
	offset := 0
	for _, val := range v {
		binary.LittleEndian.PutUint32(out[offset:offset+width], math.Float32bits(val))
		offset += width
	}
	return out
}
// blobToFloat32 decodes b as consecutive little-endian float32 values, four
// bytes each. Trailing bytes that do not form a complete value are ignored.
// An empty or nil input yields an empty, non-nil slice.
func blobToFloat32(b []byte) []float32 {
	out := make([]float32, len(b)/4)
	for i := range out {
		word := b[4*i : 4*i+4]
		out[i] = math.Float32frombits(binary.LittleEndian.Uint32(word))
	}
	return out
}
// normalize returns a new slice holding v scaled to unit Euclidean length.
// The length is accumulated in float64. A vector whose length is zero
// (including an empty one) yields an all-zero slice of the same size. The
// input is never modified.
func normalize(v []float32) []float32 {
	out := make([]float32, len(v))

	sumSquares := 0.0
	for i := range v {
		sumSquares += float64(v[i]) * float64(v[i])
	}
	length := math.Sqrt(sumSquares)
	if length == 0 {
		return out
	}

	for i := range v {
		out[i] = float32(float64(v[i]) / length)
	}
	return out
}
// dotProduct returns the sum of the element-wise products of a and b,
// accumulated in float32.
//
// When the slices differ in length only the overlapping prefix is used, so a
// shorter b no longer causes an index-out-of-range panic. Empty input yields 0.
func dotProduct(a, b []float32) float32 {
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	// Re-slice both operands to the common length so every index below is
	// provably in range.
	a, b = a[:n], b[:n]

	var sum float32
	for i := range a {
		sum += a[i] * b[i]
	}
	return sum
}