feat: add homelab configuration and transcription support

- Add host-based delegate agents (ubuntu, grizzley, truenas, panda, pve)
- Add functional delegates (coder, reasoner, research, quick)
- Add NanoGPT provider with minimax-m2.5 model
- Add transcribe tool using faster-whisper
- Update TelegramChannel with workspace_dir
- Configure Z.AI as default provider with glm-5
2026-02-18 01:55:33 +00:00
parent a2f29838b4
commit 5d9c716a72
6 changed files with 300 additions and 3 deletions
--- a/src/channels/mod.rs
+++ b/src/channels/mod.rs
@@ -633,6 +633,7 @@ pub async fn doctor_channels(config: Config) -> Result<()> {
            Arc::new(TelegramChannel::new(
                tg.bot_token.clone(),
                tg.allowed_users.clone(),
+                Some(config.workspace_dir.clone()),
            )),
        ));
    }
@@ -923,6 +924,7 @@ pub async fn start_channels(config: Config) -> Result<()> {
        channels.push(Arc::new(TelegramChannel::new(
            tg.bot_token.clone(),
            tg.allowed_users.clone(),
+            Some(config.workspace_dir.clone()),
        )));
    }

--- a/src/channels/telegram.rs
+++ b/src/channels/telegram.rs
@@ -179,18 +179,26 @@ fn parse_attachment_markers(message: &str) -> (String, Vec<TelegramAttachment>)
 }

 /// Telegram channel — long-polls the Bot API for updates
 pub struct TelegramChannel {
    bot_token: String,
    allowed_users: Vec<String>,
    client: reqwest::Client,
+    /// Workspace directory supplied at construction (system temp directory when absent).
+    workspace_dir: std::path::PathBuf,
 }

 impl TelegramChannel {
-    pub fn new(bot_token: String, allowed_users: Vec<String>) -> Self {
+    /// Creates a channel; a missing `workspace_dir` falls back to `std::env::temp_dir()`.
+    pub fn new(
+        bot_token: String,
+        allowed_users: Vec<String>,
+        workspace_dir: Option<std::path::PathBuf>,
+    ) -> Self {
        Self {
            bot_token,
            allowed_users,
            client: reqwest::Client::new(),
+            workspace_dir: workspace_dir.unwrap_or_else(std::env::temp_dir),
        }
    }

--- a/src/cron/scheduler.rs
+++ b/src/cron/scheduler.rs
@@ -231,7 +231,11 @@ async fn deliver_if_configured(config: &Config, job: &CronJob, output: &str) ->
                .telegram
                .as_ref()
                .ok_or_else(|| anyhow::anyhow!("telegram channel not configured"))?;
-            let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone());
+            let channel = TelegramChannel::new(
+                tg.bot_token.clone(),
+                tg.allowed_users.clone(),
+                Some(config.workspace_dir.clone()),
+            );
            channel.send(output, target).await?;
        }
        "discord" => {
--- a/src/providers/mod.rs
+++ b/src/providers/mod.rs
@@ -135,6 +135,7 @@ fn resolve_provider_credential(name: &str, credential_override: Option<&str>) ->
        "zai" | "z.ai" => vec!["ZAI_API_KEY"],
        "nvidia" | "nvidia-nim" | "build.nvidia.com" => vec!["NVIDIA_API_KEY"],
        "synthetic" => vec!["SYNTHETIC_API_KEY"],
+        "nanogpt" | "nano-gpt" => vec!["NANO_GPT_API_KEY"],
        "opencode" | "opencode-zen" => vec!["OPENCODE_API_KEY"],
        "vercel" | "vercel-ai" => vec!["VERCEL_API_KEY"],
        "cloudflare" | "cloudflare-ai" => vec!["CLOUDFLARE_API_KEY"],
@@ -246,6 +247,12 @@ pub fn create_provider_with_url(
            key,
            AuthStyle::Bearer,
        ))),
+        "nanogpt" | "nano-gpt" => Ok(Box::new(OpenAiCompatibleProvider::new(
+            "NanoGPT",
+            "https://nano-gpt.com/api/v1",
+            key,
+            AuthStyle::Bearer,
+        ))),
        "bedrock" | "aws-bedrock" => Ok(Box::new(OpenAiCompatibleProvider::new(
            "Amazon Bedrock",
            "https://bedrock-runtime.us-east-1.amazonaws.com",
--- a/src/tools/mod.rs
+++ b/src/tools/mod.rs
@@ -25,6 +25,7 @@ pub mod schema;
 pub mod screenshot;
 pub mod shell;
 pub mod traits;
+pub mod transcribe;

 pub use browser::{BrowserTool, ComputerUseConfig};
 pub use browser_open::BrowserOpenTool;
@@ -53,6 +54,7 @@ pub use schema::{CleaningStrategy, SchemaCleanr};
 pub use screenshot::ScreenshotTool;
 pub use shell::ShellTool;
 pub use traits::Tool;
+pub use transcribe::TranscribeTool;
 #[allow(unused_imports)]
 pub use traits::{ToolResult, ToolSpec};

@@ -191,6 +193,8 @@ pub fn all_tools_with_runtime(
    tools.push(Box::new(ScreenshotTool::new(security.clone())));
    tools.push(Box::new(ImageInfoTool::new(security.clone())));

+    tools.push(Box::new(TranscribeTool::new(security.clone(), None, None)));
+
    if let Some(key) = composio_key {
        if !key.is_empty() {
            tools.push(Box::new(ComposioTool::new(key, composio_entity_id)));
--- a/src/tools/transcribe.rs
+++ b/src/tools/transcribe.rs
@@ -0,0 +1,306 @@
+//! Audio transcription tool backed by the `faster-whisper` Python package.
+//!
+//! Each call runs an embedded script with `python3`, so the host must provide
+//! `python3` on `PATH` with the `faster_whisper` module installed.
+
+use super::traits::{Tool, ToolResult};
+use crate::security::SecurityPolicy;
+use async_trait::async_trait;
+use serde_json::json;
+use std::fmt::Write as _;
+use std::io::ErrorKind;
+use std::path::Path;
+use std::sync::Arc;
+use tokio::process::Command;
+
+/// Largest audio file accepted for transcription (100 MiB).
+const MAX_AUDIO_BYTES: u64 = 100 * 1024 * 1024;
+/// File extensions (without the dot) accepted as audio input.
+const SUPPORTED_FORMATS: &[&str] = &[
+    "mp3", "wav", "m4a", "flac", "ogg", "webm", "mp4", "mpeg", "mpga",
+];
+/// Model names a caller may request through the `model` argument.
+const SUPPORTED_MODELS: &[&str] = &["tiny", "base", "small", "medium", "large-v2", "large-v3"];
+/// Maximum number of timestamped segments included in the tool output.
+const MAX_SEGMENTS_SHOWN: usize = 20;
+
+/// Tool that transcribes an audio file by running faster-whisper in a `python3` subprocess.
+pub struct TranscribeTool {
+    security: Arc<SecurityPolicy>,
+    /// Model used when a call does not name one.
+    model: String,
+    /// Device string passed to `WhisperModel` (defaults to `cpu`).
+    device: String,
+}
+
+impl TranscribeTool {
+    /// Creates the tool; `model` falls back to `"base"` and `device` to `"cpu"`.
+    pub fn new(
+        security: Arc<SecurityPolicy>,
+        model: Option<String>,
+        device: Option<String>,
+    ) -> Self {
+        Self {
+            security,
+            model: model.unwrap_or_else(|| "base".to_string()),
+            device: device.unwrap_or_else(|| "cpu".to_string()),
+        }
+    }
+
+    /// Returns `true` when the extension of `path` is a supported format, ignoring case.
+    fn is_supported_format(path: &Path) -> bool {
+        path.extension()
+            .and_then(|ext| ext.to_str())
+            .is_some_and(|ext| SUPPORTED_FORMATS.iter().any(|f| ext.eq_ignore_ascii_case(f)))
+    }
+
+    /// Builds a failed [`ToolResult`] that carries only an error message.
+    fn failure(message: impl Into<String>) -> ToolResult {
+        ToolResult {
+            success: false,
+            output: String::new(),
+            error: Some(message.into()),
+        }
+    }
+
+    /// Python program executed with `python3 -c`.
+    ///
+    /// Arguments: `<audio_path> <model> <device> [language]`. An empty or missing
+    /// language leaves detection to faster-whisper. Prints one JSON object on stdout.
+    fn transcription_script() -> &'static str {
+        r#"
+import sys
+import json
+
+def transcribe(audio_path, model_size, device, language=None):
+    from faster_whisper import WhisperModel
+
+    model = WhisperModel(model_size, device=device, compute_type="int8")
+    segments, info = model.transcribe(audio_path, beam_size=5, language=language)
+
+    transcription = []
+    for segment in segments:
+        transcription.append({
+            "start": round(segment.start, 2),
+            "end": round(segment.end, 2),
+            "text": segment.text.strip()
+        })
+
+    result = {
+        "language": info.language,
+        "language_probability": round(info.language_probability, 2),
+        "duration": round(info.duration, 2),
+        "segments": transcription,
+        "text": " ".join(s["text"] for s in transcription)
+    }
+
+    print(json.dumps(result))
+
+if __name__ == "__main__":
+    if len(sys.argv) not in (4, 5):
+        print(json.dumps({"error": "Usage: script.py <audio_path> <model> <device> [language]"}))
+        sys.exit(1)
+
+    language = sys.argv[4] if len(sys.argv) == 5 and sys.argv[4] else None
+    transcribe(sys.argv[1], sys.argv[2], sys.argv[3], language)
+"#
+    }
+
+    /// Formats the script's JSON result as markdown; `raw` is shown when `text` is absent.
+    fn render_transcript(parsed: &serde_json::Value, raw: &str) -> String {
+        let text = parsed.get("text").and_then(|t| t.as_str()).unwrap_or(raw);
+        let language = parsed
+            .get("language")
+            .and_then(|l| l.as_str())
+            .unwrap_or("unknown");
+        let duration = parsed
+            .get("duration")
+            .and_then(|d| d.as_f64())
+            .unwrap_or(0.0);
+
+        let mut out = format!(
+            "**Transcription** ({:.1}s, language: {})\n\n{}\n",
+            duration, language, text
+        );
+
+        let segments = parsed
+            .get("segments")
+            .and_then(|s| s.as_array())
+            .map(Vec::as_slice)
+            .unwrap_or_default();
+        // A single segment would only repeat the full text above.
+        if segments.len() <= 1 {
+            return out;
+        }
+
+        out.push_str("\n**Segments:**\n");
+        for seg in segments.iter().take(MAX_SEGMENTS_SHOWN) {
+            if let (Some(start), Some(end), Some(seg_text)) = (
+                seg.get("start").and_then(|v| v.as_f64()),
+                seg.get("end").and_then(|v| v.as_f64()),
+                seg.get("text").and_then(|v| v.as_str()),
+            ) {
+                // Writing into a `String` cannot fail, so the result is ignored.
+                let _ = writeln!(out, "[{:05.1} - {:05.1}] {}", start, end, seg_text);
+            }
+        }
+        if segments.len() > MAX_SEGMENTS_SHOWN {
+            let _ = writeln!(
+                out,
+                "... and {} more segments",
+                segments.len() - MAX_SEGMENTS_SHOWN
+            );
+        }
+        out
+    }
+}
+
+#[async_trait]
+impl Tool for TranscribeTool {
+    fn name(&self) -> &str {
+        "transcribe"
+    }
+
+    fn description(&self) -> &str {
+        "Transcribe audio files to text using faster-whisper. \
+         Supports mp3, wav, m4a, flac, ogg, webm, and other common audio formats. \
+         Returns the transcription with timestamps and detected language."
+    }
+
+    fn parameters_schema(&self) -> serde_json::Value {
+        json!({
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+                "path": {
+                    "type": "string",
+                    "description": "Path to the audio file to transcribe"
+                },
+                "model": {
+                    "type": "string",
+                    "enum": SUPPORTED_MODELS,
+                    "description": "Whisper model size (default: base). Larger models are more accurate but slower."
+                },
+                "language": {
+                    "type": "string",
+                    "description": "Hint for the spoken language (e.g., 'en', 'es', 'zh'). Optional."
+                }
+            },
+            "required": ["path"]
+        })
+    }
+
+    /// Transcribes the file named by `path`, honouring the optional `model` and `language`.
+    ///
+    /// Policy, validation and subprocess failures are reported as an unsuccessful
+    /// [`ToolResult`]; `Err` is returned only when the `path` argument is missing.
+    async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
+        let path_str = args
+            .get("path")
+            .and_then(|v| v.as_str())
+            .ok_or_else(|| anyhow::anyhow!("Missing 'path' parameter"))?;
+
+        if self.security.is_rate_limited() {
+            return Ok(Self::failure("Rate limit exceeded: too many actions in the last hour"));
+        }
+        if !self.security.is_path_allowed(path_str) {
+            return Ok(Self::failure(format!(
+                "Path not allowed by security policy: {}",
+                path_str
+            )));
+        }
+        if !self.security.record_action() {
+            return Ok(Self::failure("Rate limit exceeded: action budget exhausted"));
+        }
+
+        // `join` resolves relative paths against the workspace and keeps absolute ones as-is.
+        // NOTE(review): the policy check above only sees the raw string and symlinks are not
+        // resolved here — confirm `is_path_allowed` is sufficient for this tool.
+        let full_path = self.security.workspace_dir.join(path_str);
+
+        // One metadata call covers the existence, file-type and size checks.
+        let metadata = match std::fs::metadata(&full_path) {
+            Ok(m) => m,
+            Err(e) if e.kind() == ErrorKind::NotFound => {
+                return Ok(Self::failure(format!("File not found: {}", path_str)));
+            }
+            Err(e) => {
+                return Ok(Self::failure(format!("Cannot read file metadata: {}", e)));
+            }
+        };
+        if !metadata.is_file() {
+            return Ok(Self::failure(format!("Not a regular file: {}", path_str)));
+        }
+        if metadata.len() > MAX_AUDIO_BYTES {
+            return Ok(Self::failure(format!(
+                "File too large: {} bytes (max: {} bytes)",
+                metadata.len(),
+                MAX_AUDIO_BYTES
+            )));
+        }
+
+        if !Self::is_supported_format(&full_path) {
+            let ext = full_path
+                .extension()
+                .and_then(|e| e.to_str())
+                .unwrap_or("unknown");
+            return Ok(Self::failure(format!(
+                "Unsupported audio format: {}. Supported: {}",
+                ext,
+                SUPPORTED_FORMATS.join(", ")
+            )));
+        }
+
+        // Only allow-listed names from the caller; the configured default is trusted as-is.
+        let model = match args.get("model").and_then(|v| v.as_str()) {
+            Some(requested) if SUPPORTED_MODELS.contains(&requested) => requested,
+            Some(requested) => {
+                return Ok(Self::failure(format!(
+                    "Unsupported model: {}. Supported: {}",
+                    requested,
+                    SUPPORTED_MODELS.join(", ")
+                )));
+            }
+            None => self.model.as_str(),
+        };
+        // An empty string tells the script to auto-detect the language.
+        let language = args
+            .get("language")
+            .and_then(|v| v.as_str())
+            .map_or("", str::trim);
+
+        let output = Command::new("python3")
+            .arg("-c")
+            .arg(Self::transcription_script())
+            .arg(&full_path)
+            .arg(model)
+            .arg(&self.device)
+            .arg(language)
+            .output()
+            .await;
+        let result = match output {
+            Ok(result) => result,
+            Err(e) => {
+                return Ok(Self::failure(format!("Failed to run transcription: {}", e)));
+            }
+        };
+
+        if !result.status.success() {
+            let stderr = String::from_utf8_lossy(&result.stderr);
+            return Ok(Self::failure(format!("Transcription failed: {}", stderr.trim())));
+        }
+
+        let stdout = String::from_utf8_lossy(&result.stdout);
+        // Fall back to the raw stdout when the script did not print valid JSON.
+        let rendered = match serde_json::from_str::<serde_json::Value>(&stdout) {
+            Ok(parsed) => Self::render_transcript(&parsed, &stdout),
+            Err(_) => stdout.to_string(),
+        };
+
+        Ok(ToolResult {
+            success: true,
+            output: rendered,
+            error: None,
+        })
+    }
+}