Aurix - Voice Recording Implementation
Overview
Aurix's voice recording system provides a seamless, low-latency experience for capturing user thoughts. The implementation supports multiple recording modes, real-time feedback, and efficient audio processing while maintaining privacy through local-only processing.
Recording Architecture
Recording Modes
1. Push-to-Talk (PTT)
Hold a key/button to record, release to stop.
class PushToTalkRecorder {
private isKeyPressed = false;
private recordingTimeout?: NodeJS.Timeout;
setupGlobalShortcut() {
// Register global hotkey (F4 by default)
globalShortcut.register('F4', () => {
if (!this.isKeyPressed) {
this.startRecording();
this.isKeyPressed = true;
}
});
// Use raw key events to detect release
app.on('browser-window-focus', () => {
const window = BrowserWindow.getFocusedWindow();
if (window) {
window.webContents.on('before-input-event', (event, input) => {
if (input.key === 'F4' && input.type === 'keyUp') {
this.stopRecording();
this.isKeyPressed = false;
}
});
}
});
}
}
2. Click-to-Toggle
Single click to start, click again to stop.
class ToggleRecorder {
private isRecording = false;
private startTime?: number;
toggle() {
if (!this.isRecording) {
this.startRecording();
this.startTime = Date.now();
} else {
const duration = Date.now() - this.startTime!;
if (duration < 500) {
// Prevent accidental double-clicks
return;
}
this.stopRecording();
}
this.isRecording = !this.isRecording;
}
}
3. Voice Activity Detection (VAD)
Automatically start/stop based on speech detection.
Audio Capture Implementation
Web Audio API Setup
class AudioCapture {
private audioContext: AudioContext;
private mediaStream?: MediaStream;
private source?: MediaStreamAudioSourceNode;
private processor?: ScriptProcessorNode;
private worklet?: AudioWorkletNode;
async initialize() {
// Request microphone access
this.mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
sampleRate: 16000, // Optimal for Whisper
channelCount: 1 // Mono is sufficient
}
});
// Create audio context
this.audioContext = new AudioContext({ sampleRate: 16000 });
this.source = this.audioContext.createMediaStreamSource(this.mediaStream);
// Use AudioWorklet for better performance
await this.setupAudioWorklet();
}
private async setupAudioWorklet() {
// Load custom processor
await this.audioContext.audioWorklet.addModule('audio-processor.js');
this.worklet = new AudioWorkletNode(this.audioContext, 'audio-processor', {
processorOptions: {
bufferSize: 2048,
numberOfChannels: 1
}
});
// Connect the chain
this.source!.connect(this.worklet);
// Handle processed audio
this.worklet.port.onmessage = (event) => {
this.handleAudioData(event.data);
};
}
}
Audio Worklet Processor
// audio-processor.js
class AudioProcessor extends AudioWorkletProcessor {
constructor(options) {
super();
this.bufferSize = options.processorOptions.bufferSize;
this.buffer = new Float32Array(this.bufferSize);
this.bufferIndex = 0;
}
process(inputs, outputs, parameters) {
const input = inputs[0];
if (!input || !input[0]) return true;
const samples = input[0];
for (let i = 0; i < samples.length; i++) {
this.buffer[this.bufferIndex++] = samples[i];
if (this.bufferIndex >= this.bufferSize) {
// Send buffer to main thread
this.port.postMessage({
type: 'audio-data',
buffer: this.buffer.slice(),
timestamp: currentTime
});
this.bufferIndex = 0;
}
}
return true; // Keep processor alive
}
}
registerProcessor('audio-processor', AudioProcessor);
Audio Processing Pipeline
Real-Time Audio Enhancement
class AudioEnhancer {
private noiseGate: NoiseGate;
private compressor: DynamicsCompressorNode;
private gain: GainNode;
constructor(context: AudioContext) {
// Noise gate to remove background noise
this.noiseGate = new NoiseGate(context, {
threshold: -50, // dB
ratio: 10,
attack: 0.003,
release: 0.1
});
// Compressor for consistent volume
this.compressor = context.createDynamicsCompressor();
this.compressor.threshold.value = -20;
this.compressor.knee.value = 10;
this.compressor.ratio.value = 4;
this.compressor.attack.value = 0.003;
this.compressor.release.value = 0.1;
// Output gain
this.gain = context.createGain();
this.gain.gain.value = 1.0;
}
connect(source: AudioNode, destination: AudioNode) {
source
.connect(this.noiseGate.input)
.connect(this.compressor)
.connect(this.gain)
.connect(destination);
}
}
Voice Activity Detection (VAD)
class VoiceActivityDetector {
private energyThreshold = 0.01;
private zeroCrossingThreshold = 50;
private silenceTimeout = 2000; // 2 seconds
private lastVoiceTime = 0;
detectVoice(samples: Float32Array): boolean {
const energy = this.calculateEnergy(samples);
const zeroCrossings = this.calculateZeroCrossings(samples);
// Simple VAD based on energy and zero crossings
const hasVoice = energy > this.energyThreshold &&
zeroCrossings > this.zeroCrossingThreshold;
if (hasVoice) {
this.lastVoiceTime = Date.now();
}
return hasVoice;
}
private calculateEnergy(samples: Float32Array): number {
let sum = 0;
for (let i = 0; i < samples.length; i++) {
sum += samples[i] * samples[i];
}
return Math.sqrt(sum / samples.length);
}
private calculateZeroCrossings(samples: Float32Array): number {
let crossings = 0;
for (let i = 1; i < samples.length; i++) {
if ((samples[i] >= 0) !== (samples[i - 1] >= 0)) {
crossings++;
}
}
return crossings;
}
isSilent(): boolean {
return Date.now() - this.lastVoiceTime > this.silenceTimeout;
}
}
Buffer Management
Ring Buffer Implementation
class AudioRingBuffer {
private buffer: Float32Array;
private writeIndex = 0;
private readIndex = 0;
private availableSamples = 0;
constructor(private size: number) {
this.buffer = new Float32Array(size);
}
write(samples: Float32Array): boolean {
if (this.availableSpace() < samples.length) {
return false; // Buffer full
}
for (let i = 0; i < samples.length; i++) {
this.buffer[this.writeIndex] = samples[i];
this.writeIndex = (this.writeIndex + 1) % this.size;
this.availableSamples++;
}
return true;
}
read(count: number): Float32Array | null {
if (this.availableSamples < count) {
return null; // Not enough data
}
const result = new Float32Array(count);
for (let i = 0; i < count; i++) {
result[i] = this.buffer[this.readIndex];
this.readIndex = (this.readIndex + 1) % this.size;
this.availableSamples--;
}
return result;
}
availableSpace(): number {
return this.size - this.availableSamples;
}
}
Chunk Aggregation for Whisper
WAV File Encoding
class WavEncoder {
static encode(samples: Float32Array, sampleRate: number): ArrayBuffer {
const length = samples.length;
const arrayBuffer = new ArrayBuffer(44 + length * 2);
const view = new DataView(arrayBuffer);
// WAV header
const writeString = (offset: number, string: string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
};
writeString(0, 'RIFF');
view.setUint32(4, 36 + length * 2, true); // File size
writeString(8, 'WAVE');
writeString(12, 'fmt ');
view.setUint32(16, 16, true); // Subchunk size
view.setUint16(20, 1, true); // Audio format (PCM)
view.setUint16(22, 1, true); // Channels (mono)
view.setUint32(24, sampleRate, true); // Sample rate
view.setUint32(28, sampleRate * 2, true); // Byte rate
view.setUint16(32, 2, true); // Block align
view.setUint16(34, 16, true); // Bits per sample
writeString(36, 'data');
view.setUint32(40, length * 2, true); // Data size
// Convert float samples to 16-bit PCM
let offset = 44;
for (let i = 0; i < length; i++) {
const sample = Math.max(-1, Math.min(1, samples[i]));
view.setInt16(offset, sample * 0x7FFF, true);
offset += 2;
}
return arrayBuffer;
}
}
Whisper Integration
Streaming Transcription
class WhisperTranscriber {
private whisperProcess?: ChildProcess;
private partialTranscript = '';
private transcriptSubject = new Subject<TranscriptUpdate>();
async initialize() {
// Use whisper.cpp with streaming mode
this.whisperProcess = spawn('whisper', [
'--model', this.getModelPath(),
'--language', 'en',
'--threads', '4',
'--stream',
'--step', '500', // Process every 500ms
'--length', '3000', // 3 second chunks
'-' // Read from stdin
]);
this.whisperProcess.stdout?.on('data', (data) => {
this.handleTranscription(data.toString());
});
}
async transcribe(audioBuffer: ArrayBuffer) {
if (!this.whisperProcess) {
throw new Error('Whisper not initialized');
}
// Send audio data to whisper process
this.whisperProcess.stdin?.write(Buffer.from(audioBuffer));
}
private handleTranscription(text: string) {
// Parse whisper output
const lines = text.split('\n').filter(line => line.trim());
for (const line of lines) {
if (line.startsWith('[')) {
// Timestamp line, extract transcription
const match = line.match(/\[[\d:,]+ --> [\d:,]+\]\s+(.*)/);
if (match) {
const transcript = match[1].trim();
this.transcriptSubject.next({
type: 'partial',
text: transcript,
timestamp: Date.now()
});
}
}
}
}
}
Model Management
class WhisperModelManager {
private modelPath = path.join(app.getPath('userData'), 'models');
async ensureModel(type: ModelType = 'tiny'): Promise<string> {
const modelFile = `ggml-${type}.bin`;
const fullPath = path.join(this.modelPath, modelFile);
if (await fs.pathExists(fullPath)) {
return fullPath;
}
// Download model
await this.downloadModel(type);
return fullPath;
}
private async downloadModel(type: ModelType) {
const urls: Record<ModelType, string> = {
tiny: 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin',
base: 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin',
small: 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin'
};
const response = await fetch(urls[type]);
const buffer = await response.arrayBuffer();
// Verify checksum
const hash = crypto.createHash('sha256').update(Buffer.from(buffer)).digest('hex');
if (hash !== EXPECTED_HASHES[type]) {
throw new Error('Model checksum mismatch');
}
await fs.writeFile(path.join(this.modelPath, `ggml-${type}.bin`), Buffer.from(buffer));
}
selectOptimalModel(): ModelType {
const memory = os.totalmem();
const cpus = os.cpus().length;
if (memory > 16 * 1024 * 1024 * 1024 && cpus >= 8) {
return 'small';
} else if (memory > 8 * 1024 * 1024 * 1024 && cpus >= 4) {
return 'base';
}
return 'tiny';
}
}
Platform-Specific Implementations
macOS Optimizations
class MacOSAudioHandler {
async requestPermissions() {
const { systemPreferences } = require('electron');
const microphoneStatus = systemPreferences.getMediaAccessStatus('microphone');
if (microphoneStatus !== 'granted') {
await systemPreferences.askForMediaAccess('microphone');
}
}
setupAudioSession() {
// Use CoreAudio for better performance on macOS
if (process.platform === 'darwin') {
// Set audio session category for recording
app.commandLine.appendSwitch('audio-session-category', 'record');
// Enable hardware acceleration for Metal-enabled Macs
app.commandLine.appendSwitch('enable-features', 'Metal');
}
}
}
Windows Audio Session
class WindowsAudioHandler {
setupWASAPI() {
if (process.platform === 'win32') {
// Use WASAPI for lower latency on Windows
app.commandLine.appendSwitch('audio-backend', 'wasapi');
// Set process priority for better audio performance
const { exec } = require('child_process');
exec(`wmic process where "name='${app.getName()}.exe'" call setpriority "high priority"`);
}
}
}
Linux ALSA/PulseAudio
class LinuxAudioHandler {
async detectAudioSystem(): Promise<'alsa' | 'pulse' | 'pipewire'> {
try {
await exec('pactl info');
return 'pulse';
} catch {
try {
await exec('pipewire --version');
return 'pipewire';
} catch {
return 'alsa';
}
}
}
}
Error Handling and Recovery
Common Audio Errors
class AudioErrorHandler {
async handleError(error: Error): Promise<RecoveryAction> {
if (error.name === 'NotAllowedError') {
// Microphone permission denied
return {
action: 'request-permission',
message: 'Microphone access required for voice recording'
};
}
if (error.name === 'NotFoundError') {
// No microphone found
return {
action: 'show-settings',
message: 'No microphone detected. Please connect a microphone.'
};
}
if (error.name === 'NotReadableError') {
// Microphone in use by another app
return {
action: 'retry-delayed',
message: 'Microphone is being used by another application'
};
}
// Default fallback
return {
action: 'show-error',
message: 'An error occurred during audio recording'
};
}
}
Recovery Strategies
Performance Monitoring
Audio Pipeline Metrics
interface AudioMetrics {
captureLatency: number;
processingLatency: number;
transcriptionLatency: number;
dropoutCount: number;
bufferUnderrunCount: number;
cpuUsage: number;
memoryUsage: number;
}
class AudioPerformanceMonitor {
private metrics: AudioMetrics = {
captureLatency: 0,
processingLatency: 0,
transcriptionLatency: 0,
dropoutCount: 0,
bufferUnderrunCount: 0,
cpuUsage: 0,
memoryUsage: 0
};
private metricsInterval?: NodeJS.Timer;
start() {
this.metricsInterval = setInterval(() => {
this.updateMetrics();
this.checkPerformance();
}, 1000);
}
private checkPerformance() {
if (this.metrics.captureLatency > 50) {
console.warn('High capture latency detected:', this.metrics.captureLatency);
}
if (this.metrics.dropoutCount > 0) {
console.error('Audio dropouts detected:', this.metrics.dropoutCount);
this.optimizePerformance();
}
}
private optimizePerformance() {
// Lower sample rate if needed
// Increase buffer size
// Reduce processing complexity
}
}
User Interface Integration
Recording Status Indicator
class RecordingUI {
private statusElement: HTMLElement;
private waveformCanvas: HTMLCanvasElement;
private animationFrame?: number;
updateStatus(state: RecordingState) {
switch (state) {
case 'idle':
this.statusElement.className = 'recording-idle';
this.statusElement.textContent = 'Click to record';
break;
case 'recording':
this.statusElement.className = 'recording-active';
this.statusElement.textContent = 'Recording...';
this.startWaveformAnimation();
break;
case 'processing':
this.statusElement.className = 'recording-processing';
this.statusElement.textContent = 'Processing...';
break;
}
}
private startWaveformAnimation() {
const ctx = this.waveformCanvas.getContext('2d')!;
const width = this.waveformCanvas.width;
const height = this.waveformCanvas.height;
const animate = () => {
ctx.clearRect(0, 0, width, height);
// Draw waveform from audio data
ctx.beginPath();
ctx.strokeStyle = '#4CAF50';
ctx.lineWidth = 2;
for (let i = 0; i < this.audioSamples.length; i++) {
const x = (i / this.audioSamples.length) * width;
const y = (1 + this.audioSamples[i]) * height / 2;
if (i === 0) {
ctx.moveTo(x, y);
} else {
ctx.lineTo(x, y);
}
}
ctx.stroke();
this.animationFrame = requestAnimationFrame(animate);
};
animate();
}
}
Keyboard Shortcuts
class KeyboardShortcuts {
private shortcuts: Map<string, () => void> = new Map();
register() {
// Primary recording shortcut
this.shortcuts.set('F4', () => this.toggleRecording());
// Alternative shortcuts
this.shortcuts.set('CommandOrControl+Shift+R', () => this.toggleRecording());
this.shortcuts.set('CommandOrControl+Shift+S', () => this.stopRecording());
this.shortcuts.set('CommandOrControl+Shift+P', () => this.pauseRecording());
// Register with Electron
this.shortcuts.forEach((handler, key) => {
globalShortcut.register(key, handler);
});
}
customizeShortcut(action: string, newKey: string) {
// Allow users to customize shortcuts
const oldKey = Array.from(this.shortcuts.entries())
.find(([_, handler]) => handler.name === action)?.[0];
if (oldKey) {
globalShortcut.unregister(oldKey);
this.shortcuts.delete(oldKey);
}
this.shortcuts.set(newKey, this[action]);
globalShortcut.register(newKey, this[action]);
}
}
Testing Strategy
Audio Recording Tests
describe('Audio Recording', () => {
let recorder: AudioRecorder;
beforeEach(() => {
recorder = new AudioRecorder();
});
test('should initialize audio context', async () => {
await recorder.initialize();
expect(recorder.audioContext).toBeDefined();
expect(recorder.audioContext.sampleRate).toBe(16000);
});
test('should handle permission denial gracefully', async () => {
// Mock permission denial
navigator.mediaDevices.getUserMedia = jest.fn()
.mockRejectedValue(new DOMException('Permission denied', 'NotAllowedError'));
await expect(recorder.initialize()).rejects.toThrow('Permission denied');
});
test('should detect voice activity', () => {
const vad = new VoiceActivityDetector();
// Test with silence
const silence = new Float32Array(1024).fill(0);
expect(vad.detectVoice(silence)).toBe(false);
// Test with voice
const voice = new Float32Array(1024);
for (let i = 0; i < voice.length; i++) {
voice[i] = Math.sin(2 * Math.PI * 440 * i / 16000) * 0.5;
}
expect(vad.detectVoice(voice)).toBe(true);
});
});
Conclusion
Aurix's voice recording implementation provides a robust, low-latency solution for capturing user thoughts. Key features include:
- Multiple Recording Modes: Push-to-talk, toggle, and voice-activated
- Real-Time Processing: Streaming transcription with minimal latency
- Cross-Platform: Optimized for Windows, macOS, and Linux
- Privacy-First: All processing happens locally
- Error Resilient: Comprehensive error handling and recovery
The architecture ensures that users can effortlessly capture their ideas without technical barriers, maintaining Aurix's promise of frictionless documentation.