From 5d9c716a72b9b04e7958897b76ec770650038c86 Mon Sep 17 00:00:00 2001 From: TopherMayor Date: Wed, 18 Feb 2026 01:55:33 +0000 Subject: [PATCH] feat: add homelab configuration and transcription support - Add host-based delegate agents (ubuntu, grizzley, truenas, panda, pve) - Add functional delegates (coder, reasoner, research, quick) - Add NanoGPT provider with minimax-m2.5 model - Add transcribe tool using faster-whisper - Update TelegramChannel with workspace_dir - Configure Z.AI as default provider with glm-5 --- src/channels/mod.rs | 2 + src/channels/telegram.rs | 5 +- src/cron/scheduler.rs | 2 +- src/providers/mod.rs | 7 + src/tools/mod.rs | 4 + src/tools/transcribe.rs | 283 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 300 insertions(+), 3 deletions(-) create mode 100644 src/tools/transcribe.rs diff --git a/src/channels/mod.rs b/src/channels/mod.rs index d63f63d..76c281a 100644 --- a/src/channels/mod.rs +++ b/src/channels/mod.rs @@ -633,6 +633,7 @@ pub async fn doctor_channels(config: Config) -> Result<()> { Arc::new(TelegramChannel::new( tg.bot_token.clone(), tg.allowed_users.clone(), + Some(config.workspace_dir.clone()), )), )); } @@ -923,6 +924,7 @@ pub async fn start_channels(config: Config) -> Result<()> { channels.push(Arc::new(TelegramChannel::new( tg.bot_token.clone(), tg.allowed_users.clone(), + Some(config.workspace_dir.clone()), ))); } diff --git a/src/channels/telegram.rs b/src/channels/telegram.rs index 5d25de1..b256b65 100644 --- a/src/channels/telegram.rs +++ b/src/channels/telegram.rs @@ -179,18 +179,19 @@ fn parse_attachment_markers(message: &str) -> (String, Vec) } /// Telegram channel — long-polls the Bot API for updates -pub struct TelegramChannel { +pub struct TelegramChannel { workspace_dir: std::path::PathBuf, bot_token: String, allowed_users: Vec, client: reqwest::Client, } impl TelegramChannel { - pub fn new(bot_token: String, allowed_users: Vec) -> Self { + pub fn new(bot_token: String, allowed_users: Vec, workspace_dir: 
Option) -> Self { Self { bot_token, allowed_users, client: reqwest::Client::new(), + workspace_dir: workspace_dir.unwrap_or_else(|| std::path::PathBuf::from("/tmp")), } } diff --git a/src/cron/scheduler.rs b/src/cron/scheduler.rs index 4562dba..9483932 100644 --- a/src/cron/scheduler.rs +++ b/src/cron/scheduler.rs @@ -231,7 +231,7 @@ async fn deliver_if_configured(config: &Config, job: &CronJob, output: &str) -> .telegram .as_ref() .ok_or_else(|| anyhow::anyhow!("telegram channel not configured"))?; - let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone()); + let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone(), Some(config.workspace_dir.clone())); channel.send(output, target).await?; } "discord" => { diff --git a/src/providers/mod.rs b/src/providers/mod.rs index 07c427d..07a5a5d 100644 --- a/src/providers/mod.rs +++ b/src/providers/mod.rs @@ -135,6 +135,7 @@ fn resolve_provider_credential(name: &str, credential_override: Option<&str>) -> "zai" | "z.ai" => vec!["ZAI_API_KEY"], "nvidia" | "nvidia-nim" | "build.nvidia.com" => vec!["NVIDIA_API_KEY"], "synthetic" => vec!["SYNTHETIC_API_KEY"], + "nanogpt" | "nano-gpt" => vec!["NANO_GPT_API_KEY"], "opencode" | "opencode-zen" => vec!["OPENCODE_API_KEY"], "vercel" | "vercel-ai" => vec!["VERCEL_API_KEY"], "cloudflare" | "cloudflare-ai" => vec!["CLOUDFLARE_API_KEY"], @@ -246,6 +247,12 @@ pub fn create_provider_with_url( key, AuthStyle::Bearer, ))), + "nanogpt" | "nano-gpt" => Ok(Box::new(OpenAiCompatibleProvider::new( + "NanoGPT", + "https://nano-gpt.com/api/v1", + key, + AuthStyle::Bearer, + ))), "bedrock" | "aws-bedrock" => Ok(Box::new(OpenAiCompatibleProvider::new( "Amazon Bedrock", "https://bedrock-runtime.us-east-1.amazonaws.com", diff --git a/src/tools/mod.rs b/src/tools/mod.rs index b541736..26e8600 100644 --- a/src/tools/mod.rs +++ b/src/tools/mod.rs @@ -25,6 +25,7 @@ pub mod schema; pub mod screenshot; pub mod shell; pub mod traits; +pub mod transcribe; pub use 
browser::{BrowserTool, ComputerUseConfig}; pub use browser_open::BrowserOpenTool; @@ -53,6 +54,7 @@ pub use schema::{CleaningStrategy, SchemaCleanr}; pub use screenshot::ScreenshotTool; pub use shell::ShellTool; pub use traits::Tool; +pub use transcribe::TranscribeTool; #[allow(unused_imports)] pub use traits::{ToolResult, ToolSpec}; @@ -191,6 +193,8 @@ pub fn all_tools_with_runtime( tools.push(Box::new(ScreenshotTool::new(security.clone()))); tools.push(Box::new(ImageInfoTool::new(security.clone()))); + tools.push(Box::new(TranscribeTool::new(security.clone(), None, None))); + if let Some(key) = composio_key { if !key.is_empty() { tools.push(Box::new(ComposioTool::new(key, composio_entity_id)));
diff --git a/src/tools/transcribe.rs b/src/tools/transcribe.rs
new file mode 100644
index 0000000..f9c6ece
--- /dev/null
+++ b/src/tools/transcribe.rs
@@ -0,0 +1,326 @@
+//! Audio transcription tool backed by the `faster-whisper` Python package.
+
+use super::traits::{Tool, ToolResult};
+use crate::security::SecurityPolicy;
+use async_trait::async_trait;
+use serde_json::json;
+use std::path::Path;
+use std::sync::Arc;
+use tokio::process::Command;
+
+/// Largest audio file accepted for transcription (100 MiB).
+const MAX_AUDIO_BYTES: u64 = 104_857_600;
+/// Lowercase file extensions (without the dot) accepted for transcription.
+const SUPPORTED_FORMATS: &[&str] = &[
+    "mp3", "wav", "m4a", "flac", "ogg", "webm", "mp4", "mpeg", "mpga",
+];
+/// Model sizes a caller may request; also used as the `model` enum in the schema.
+const ALLOWED_MODELS: &[&str] = &["tiny", "base", "small", "medium", "large-v2", "large-v3"];
+/// Maximum number of timestamped segments echoed in the tool output.
+const MAX_SEGMENTS_SHOWN: usize = 20;
+
+/// Transcribes workspace audio files by running faster-whisper through `python3`.
+pub struct TranscribeTool {
+    security: Arc<SecurityPolicy>,
+    model: String,
+    device: String,
+}
+
+impl TranscribeTool {
+    /// Creates the tool; `model` defaults to `"base"` and `device` to `"cpu"`.
+    pub fn new(
+        security: Arc<SecurityPolicy>,
+        model: Option<String>,
+        device: Option<String>,
+    ) -> Self {
+        Self {
+            security,
+            model: model.unwrap_or_else(|| "base".to_string()),
+            device: device.unwrap_or_else(|| "cpu".to_string()),
+        }
+    }
+
+    /// Returns `true` if the extension of `path` (case-insensitive) is supported.
+    fn is_supported_format(path: &Path) -> bool {
+        path.extension()
+            .and_then(|ext| ext.to_str())
+            .map(|ext| SUPPORTED_FORMATS.contains(&ext.to_lowercase().as_str()))
+            .unwrap_or(false)
+    }
+
+    /// Builds a failed `ToolResult` whose error is `message`.
+    fn failure(message: impl Into<String>) -> ToolResult {
+        ToolResult {
+            success: false,
+            output: String::new(),
+            error: Some(message.into()),
+        }
+    }
+
+    /// Python program run via `python3 -c`.
+    ///
+    /// Arguments: `<audio_path> <model_size> <device> [language]`. On success it prints one
+    /// JSON object (`language`, `language_probability`, `duration`, `segments`, `text`) to
+    /// stdout; usage errors go to stderr with exit status 1.
+    fn transcription_script() -> &'static str {
+        r#"
+import sys
+import json
+
+def transcribe(audio_path, model_size, device, language=None):
+    from faster_whisper import WhisperModel
+
+    model = WhisperModel(model_size, device=device, compute_type="int8")
+    segments, info = model.transcribe(audio_path, beam_size=5, language=language)
+
+    transcription = []
+    for segment in segments:
+        transcription.append({
+            "start": round(segment.start, 2),
+            "end": round(segment.end, 2),
+            "text": segment.text.strip()
+        })
+
+    result = {
+        "language": info.language,
+        "language_probability": round(info.language_probability, 2),
+        "duration": round(info.duration, 2),
+        "segments": transcription,
+        "text": " ".join(s["text"] for s in transcription)
+    }
+
+    print(json.dumps(result))
+
+if __name__ == "__main__":
+    if len(sys.argv) not in (4, 5):
+        print("Usage: script.py <audio_path> <model_size> <device> [language]", file=sys.stderr)
+        sys.exit(1)
+
+    language = sys.argv[4] if len(sys.argv) == 5 else None
+    transcribe(sys.argv[1], sys.argv[2], sys.argv[3], language)
+"#
+    }
+
+    /// Renders the script's JSON as Markdown: a header with duration and language, the full
+    /// text, and at most `MAX_SEGMENTS_SHOWN` timestamped segments. `raw_stdout` is used as
+    /// the text when the JSON has no string `text` field.
+    fn render_transcription(parsed: &serde_json::Value, raw_stdout: &str) -> String {
+        let text = parsed
+            .get("text")
+            .and_then(|t| t.as_str())
+            .unwrap_or(raw_stdout);
+        let language = parsed
+            .get("language")
+            .and_then(|l| l.as_str())
+            .unwrap_or("unknown");
+        let duration = parsed
+            .get("duration")
+            .and_then(|d| d.as_f64())
+            .unwrap_or(0.0);
+
+        let mut out = format!(
+            "**Transcription** ({:.1}s, language: {})\n\n{}\n",
+            duration, language, text
+        );
+
+        let segments = match parsed.get("segments").and_then(|s| s.as_array()) {
+            // A single segment would only repeat the text printed above.
+            Some(segments) if segments.len() > 1 => segments,
+            _ => return out,
+        };
+
+        out.push_str("\n**Segments:**\n");
+        for seg in segments.iter().take(MAX_SEGMENTS_SHOWN) {
+            if let (Some(start), Some(end), Some(seg_text)) = (
+                seg.get("start").and_then(|v| v.as_f64()),
+                seg.get("end").and_then(|v| v.as_f64()),
+                seg.get("text").and_then(|v| v.as_str()),
+            ) {
+                out.push_str(&format!(
+                    "[{:05.1} - {:05.1}] {}\n",
+                    start, end, seg_text
+                ));
+            }
+        }
+        if segments.len() > MAX_SEGMENTS_SHOWN {
+            out.push_str(&format!(
+                "... and {} more segments\n",
+                segments.len() - MAX_SEGMENTS_SHOWN
+            ));
+        }
+
+        out
+    }
+}
+
+#[async_trait]
+impl Tool for TranscribeTool {
+    fn name(&self) -> &str {
+        "transcribe"
+    }
+
+    fn description(&self) -> &str {
+        "Transcribe audio files to text using faster-whisper. \
+         Supports mp3, wav, m4a, flac, ogg, webm, and other common audio formats. \
+         Returns the transcription with timestamps and detected language."
+    }
+
+    fn parameters_schema(&self) -> serde_json::Value {
+        json!({
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+                "path": {
+                    "type": "string",
+                    "description": "Path to the audio file to transcribe"
+                },
+                "model": {
+                    "type": "string",
+                    "enum": ALLOWED_MODELS,
+                    "description": "Whisper model size (default: base). Larger models are more accurate but slower."
+                },
+                "language": {
+                    "type": "string",
+                    "description": "Hint for the spoken language (e.g., 'en', 'es', 'zh'). Optional."
+                }
+            },
+            "required": ["path"]
+        })
+    }
+
+    /// Transcribes the audio file named by `args["path"]` (relative to the workspace).
+    ///
+    /// Optional `args["model"]` must be one of `ALLOWED_MODELS`; optional `args["language"]`
+    /// is forwarded to faster-whisper as a language hint. Policy, validation and
+    /// transcription failures are reported as `ToolResult { success: false, .. }`; only a
+    /// missing `path` yields `Err`.
+    async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
+        let path_str = args
+            .get("path")
+            .and_then(|v| v.as_str())
+            .ok_or_else(|| anyhow::anyhow!("Missing 'path' parameter"))?;
+
+        if self.security.is_rate_limited() {
+            return Ok(Self::failure(
+                "Rate limit exceeded: too many actions in the last hour",
+            ));
+        }
+
+        if !self.security.is_path_allowed(path_str) {
+            return Ok(Self::failure(format!(
+                "Path not allowed by security policy: {}",
+                path_str
+            )));
+        }
+
+        if !self.security.record_action() {
+            return Ok(Self::failure(
+                "Rate limit exceeded: action budget exhausted",
+            ));
+        }
+
+        let full_path = self.security.workspace_dir.join(path_str);
+
+        // A single metadata call covers existence, file type and size, so the file cannot
+        // change between separate checks.
+        let metadata = match std::fs::metadata(&full_path) {
+            Ok(m) => m,
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                return Ok(Self::failure(format!("File not found: {}", path_str)));
+            }
+            Err(e) => {
+                return Ok(Self::failure(format!("Cannot read file metadata: {}", e)));
+            }
+        };
+
+        if !metadata.is_file() {
+            return Ok(Self::failure(format!("Not a regular file: {}", path_str)));
+        }
+
+        if metadata.len() > MAX_AUDIO_BYTES {
+            return Ok(Self::failure(format!(
+                "File too large: {} bytes (max: {} bytes)",
+                metadata.len(),
+                MAX_AUDIO_BYTES
+            )));
+        }
+
+        if !Self::is_supported_format(&full_path) {
+            let ext = full_path
+                .extension()
+                .and_then(|e| e.to_str())
+                .unwrap_or("unknown");
+            return Ok(Self::failure(format!(
+                "Unsupported audio format: {}. Supported: {}",
+                ext,
+                SUPPORTED_FORMATS.join(", ")
+            )));
+        }
+
+        // The model name is handed to WhisperModel, so only accept the advertised sizes
+        // from callers; the configured default is trusted as-is.
+        let model = match args.get("model").and_then(|v| v.as_str()) {
+            Some(requested) if !ALLOWED_MODELS.contains(&requested) => {
+                return Ok(Self::failure(format!(
+                    "Unsupported model: {}. Supported: {}",
+                    requested,
+                    ALLOWED_MODELS.join(", ")
+                )));
+            }
+            Some(requested) => requested,
+            None => self.model.as_str(),
+        };
+
+        // Optional language hint; blank values mean "auto-detect".
+        let language = args
+            .get("language")
+            .and_then(|v| v.as_str())
+            .map(str::trim)
+            .filter(|lang| !lang.is_empty());
+
+        let mut command = Command::new("python3");
+        command
+            .arg("-c")
+            .arg(Self::transcription_script())
+            .arg(&full_path)
+            .arg(model)
+            .arg(&self.device)
+            // Do not leave a transcription running if this future is dropped.
+            .kill_on_drop(true);
+        if let Some(language) = language {
+            command.arg(language);
+        }
+
+        let result = match command.output().await {
+            Ok(result) => result,
+            Err(e) => {
+                return Ok(Self::failure(format!("Failed to run transcription: {}", e)));
+            }
+        };
+
+        let stdout = String::from_utf8_lossy(&result.stdout);
+
+        if !result.status.success() {
+            let stderr = String::from_utf8_lossy(&result.stderr);
+            // Prefer stderr, but fall back to stdout so the error is never blank.
+            let detail = if stderr.trim().is_empty() {
+                stdout.trim()
+            } else {
+                stderr.trim()
+            };
+            return Ok(Self::failure(format!("Transcription failed: {}", detail)));
+        }
+
+        let output = match serde_json::from_str::<serde_json::Value>(&stdout) {
+            Ok(parsed) => Self::render_transcription(&parsed, &stdout),
+            // Best effort: pass the raw output through if it is not valid JSON.
+            Err(_) => stdout.to_string(),
+        };
+
+        Ok(ToolResult {
+            success: true,
+            output,
+            error: None,
+        })
+    }
+}