feat: add homelab configuration and transcription support
Some checks failed
CI / Detect Change Scope (push) Has been cancelled
CI / Format & Lint (push) Has been cancelled
CI / Lint Strict Delta (push) Has been cancelled
CI / Test (push) Has been cancelled
CI / Build (Smoke) (push) Has been cancelled
CI / Docs-Only Fast Path (push) Has been cancelled
CI / Non-Rust Fast Path (push) Has been cancelled
CI / Docs Quality (push) Has been cancelled
CI / CI Required Gate (push) Has been cancelled
Docker / PR Docker Smoke (push) Has been cancelled
Docker / Build and Push Docker Image (push) Has been cancelled
Rust Package Security Audit / Security Audit (push) Has been cancelled
Rust Package Security Audit / License & Supply Chain (push) Has been cancelled

- Add host-based delegate agents (ubuntu, grizzley, truenas, panda, pve)
- Add functional delegates (coder, reasoner, research, quick)
- Add NanoGPT provider with minimax-m2.5 model
- Add transcribe tool using faster-whisper
- Update TelegramChannel with workspace_dir
- Configure Z.AI as default provider with glm-5
This commit is contained in:
2026-02-18 01:55:33 +00:00
parent a2f29838b4
commit 5d9c716a72
6 changed files with 300 additions and 3 deletions

View File

@@ -633,6 +633,7 @@ pub async fn doctor_channels(config: Config) -> Result<()> {
Arc::new(TelegramChannel::new( Arc::new(TelegramChannel::new(
tg.bot_token.clone(), tg.bot_token.clone(),
tg.allowed_users.clone(), tg.allowed_users.clone(),
Some(config.workspace_dir.clone()),
)), )),
)); ));
} }
@@ -923,6 +924,7 @@ pub async fn start_channels(config: Config) -> Result<()> {
channels.push(Arc::new(TelegramChannel::new( channels.push(Arc::new(TelegramChannel::new(
tg.bot_token.clone(), tg.bot_token.clone(),
tg.allowed_users.clone(), tg.allowed_users.clone(),
Some(config.workspace_dir.clone()),
))); )));
} }

View File

@@ -179,18 +179,19 @@ fn parse_attachment_markers(message: &str) -> (String, Vec<TelegramAttachment>)
} }
/// Telegram channel — long-polls the Bot API for updates /// Telegram channel — long-polls the Bot API for updates
pub struct TelegramChannel { pub struct TelegramChannel { workspace_dir: std::path::PathBuf,
bot_token: String, bot_token: String,
allowed_users: Vec<String>, allowed_users: Vec<String>,
client: reqwest::Client, client: reqwest::Client,
} }
impl TelegramChannel { impl TelegramChannel {
pub fn new(bot_token: String, allowed_users: Vec<String>) -> Self { pub fn new(bot_token: String, allowed_users: Vec<String>, workspace_dir: Option<std::path::PathBuf>) -> Self {
Self { Self {
bot_token, bot_token,
allowed_users, allowed_users,
client: reqwest::Client::new(), client: reqwest::Client::new(),
workspace_dir: workspace_dir.unwrap_or_else(|| std::path::PathBuf::from("/tmp")),
} }
} }

View File

@@ -231,7 +231,7 @@ async fn deliver_if_configured(config: &Config, job: &CronJob, output: &str) ->
.telegram .telegram
.as_ref() .as_ref()
.ok_or_else(|| anyhow::anyhow!("telegram channel not configured"))?; .ok_or_else(|| anyhow::anyhow!("telegram channel not configured"))?;
let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone()); let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone(), Some(config.workspace_dir.clone()));
channel.send(output, target).await?; channel.send(output, target).await?;
} }
"discord" => { "discord" => {

View File

@@ -135,6 +135,7 @@ fn resolve_provider_credential(name: &str, credential_override: Option<&str>) ->
"zai" | "z.ai" => vec!["ZAI_API_KEY"], "zai" | "z.ai" => vec!["ZAI_API_KEY"],
"nvidia" | "nvidia-nim" | "build.nvidia.com" => vec!["NVIDIA_API_KEY"], "nvidia" | "nvidia-nim" | "build.nvidia.com" => vec!["NVIDIA_API_KEY"],
"synthetic" => vec!["SYNTHETIC_API_KEY"], "synthetic" => vec!["SYNTHETIC_API_KEY"],
"nanogpt" | "nano-gpt" => vec!["NANO_GPT_API_KEY"],
"opencode" | "opencode-zen" => vec!["OPENCODE_API_KEY"], "opencode" | "opencode-zen" => vec!["OPENCODE_API_KEY"],
"vercel" | "vercel-ai" => vec!["VERCEL_API_KEY"], "vercel" | "vercel-ai" => vec!["VERCEL_API_KEY"],
"cloudflare" | "cloudflare-ai" => vec!["CLOUDFLARE_API_KEY"], "cloudflare" | "cloudflare-ai" => vec!["CLOUDFLARE_API_KEY"],
@@ -246,6 +247,12 @@ pub fn create_provider_with_url(
key, key,
AuthStyle::Bearer, AuthStyle::Bearer,
))), ))),
"nanogpt" | "nano-gpt" => Ok(Box::new(OpenAiCompatibleProvider::new(
"NanoGPT",
"https://nano-gpt.com/api/v1",
key,
AuthStyle::Bearer,
))),
"bedrock" | "aws-bedrock" => Ok(Box::new(OpenAiCompatibleProvider::new( "bedrock" | "aws-bedrock" => Ok(Box::new(OpenAiCompatibleProvider::new(
"Amazon Bedrock", "Amazon Bedrock",
"https://bedrock-runtime.us-east-1.amazonaws.com", "https://bedrock-runtime.us-east-1.amazonaws.com",

View File

@@ -25,6 +25,7 @@ pub mod schema;
pub mod screenshot; pub mod screenshot;
pub mod shell; pub mod shell;
pub mod traits; pub mod traits;
pub mod transcribe;
pub use browser::{BrowserTool, ComputerUseConfig}; pub use browser::{BrowserTool, ComputerUseConfig};
pub use browser_open::BrowserOpenTool; pub use browser_open::BrowserOpenTool;
@@ -53,6 +54,7 @@ pub use schema::{CleaningStrategy, SchemaCleanr};
pub use screenshot::ScreenshotTool; pub use screenshot::ScreenshotTool;
pub use shell::ShellTool; pub use shell::ShellTool;
pub use traits::Tool; pub use traits::Tool;
pub use transcribe::TranscribeTool;
#[allow(unused_imports)] #[allow(unused_imports)]
pub use traits::{ToolResult, ToolSpec}; pub use traits::{ToolResult, ToolSpec};
@@ -191,6 +193,8 @@ pub fn all_tools_with_runtime(
tools.push(Box::new(ScreenshotTool::new(security.clone()))); tools.push(Box::new(ScreenshotTool::new(security.clone())));
tools.push(Box::new(ImageInfoTool::new(security.clone()))); tools.push(Box::new(ImageInfoTool::new(security.clone())));
tools.push(Box::new(TranscribeTool::new(security.clone(), None, None)));
if let Some(key) = composio_key { if let Some(key) = composio_key {
if !key.is_empty() { if !key.is_empty() {
tools.push(Box::new(ComposioTool::new(key, composio_entity_id))); tools.push(Box::new(ComposioTool::new(key, composio_entity_id)));

283
src/tools/transcribe.rs Normal file
View File

@@ -0,0 +1,283 @@
use super::traits::{Tool, ToolResult};
use crate::security::SecurityPolicy;
use async_trait::async_trait;
use serde_json::json;
use std::path::Path;
use std::sync::Arc;
use tokio::process::Command;
/// Largest audio file accepted for transcription: 100 MiB, in bytes.
const MAX_AUDIO_BYTES: u64 = 100 * 1024 * 1024;

/// File extensions (lower-case, without the dot) accepted for transcription.
const SUPPORTED_FORMATS: &[&str] = &[
    "mp3", "wav", "m4a", "flac", "ogg", "webm", "mp4", "mpeg", "mpga",
];
/// Tool that transcribes an audio file to text by running `faster-whisper`
/// in a `python3` subprocess.
pub struct TranscribeTool {
    /// Policy consulted for rate limiting, path allow-listing and the workspace root.
    security: Arc<SecurityPolicy>,
    /// Whisper model used when the caller does not pass a `model` argument.
    model: String,
    /// Value passed to `WhisperModel(..., device=...)` in the Python script.
    device: String,
}

impl TranscribeTool {
    /// Model names a caller may request; mirrors the `enum` in `parameters_schema`.
    const ALLOWED_MODELS: &'static [&'static str] =
        &["tiny", "base", "small", "medium", "large-v2", "large-v3"];

    /// Maximum number of segments listed individually in the formatted output.
    const MAX_LISTED_SEGMENTS: usize = 20;

    /// Creates the tool.
    ///
    /// `model` defaults to `"base"` and `device` to `"cpu"` when `None`.
    pub fn new(
        security: Arc<SecurityPolicy>,
        model: Option<String>,
        device: Option<String>,
    ) -> Self {
        Self {
            security,
            model: model.unwrap_or_else(|| "base".to_string()),
            device: device.unwrap_or_else(|| "cpu".to_string()),
        }
    }

    /// Returns `true` when the extension of `path` (compared case-insensitively)
    /// is listed in `SUPPORTED_FORMATS`. Paths without a UTF-8 extension are rejected.
    fn is_supported_format(path: &Path) -> bool {
        path.extension()
            .and_then(|ext| ext.to_str())
            .map_or(false, |ext| {
                SUPPORTED_FORMATS
                    .iter()
                    .any(|supported| ext.eq_ignore_ascii_case(supported))
            })
    }

    /// Python program executed via `python3 -c`.
    ///
    /// Arguments: `<audio_path> <model> <device> [language]`. On success it
    /// prints one JSON object with `language`, `language_probability`,
    /// `duration`, `segments` and `text`. On a usage error it prints
    /// `{"error": ...}` to stdout and exits with status 1.
    fn transcription_script() -> &'static str {
        r#"
import sys
import json

def transcribe(audio_path, model_size, device, language=None):
    from faster_whisper import WhisperModel
    model = WhisperModel(model_size, device=device, compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5, language=language)
    transcription = []
    for segment in segments:
        transcription.append({
            "start": round(segment.start, 2),
            "end": round(segment.end, 2),
            "text": segment.text.strip()
        })
    result = {
        "language": info.language,
        "language_probability": round(info.language_probability, 2),
        "duration": round(info.duration, 2),
        "segments": transcription,
        "text": " ".join(s["text"] for s in transcription)
    }
    print(json.dumps(result))

if __name__ == "__main__":
    if len(sys.argv) not in (4, 5):
        print(json.dumps({"error": "Usage: script.py <audio_path> <model> <device> [language]"}))
        sys.exit(1)
    language = sys.argv[4] if len(sys.argv) == 5 and sys.argv[4] else None
    transcribe(sys.argv[1], sys.argv[2], sys.argv[3], language)
"#
    }

    /// Builds a failed `ToolResult` carrying `message` as the error text.
    fn failure(message: impl Into<String>) -> ToolResult {
        ToolResult {
            success: false,
            output: String::new(),
            error: Some(message.into()),
        }
    }

    /// Renders the script's JSON output as Markdown.
    ///
    /// `raw` is the unparsed stdout and is used as the transcript body when the
    /// JSON has no string `text` field. Segments are listed only when there is
    /// more than one, and at most `MAX_LISTED_SEGMENTS` of them are shown.
    fn format_transcript(parsed: &serde_json::Value, raw: &str) -> String {
        let text = parsed.get("text").and_then(|t| t.as_str()).unwrap_or(raw);
        let language = parsed
            .get("language")
            .and_then(|l| l.as_str())
            .unwrap_or("unknown");
        let duration = parsed
            .get("duration")
            .and_then(|d| d.as_f64())
            .unwrap_or(0.0);

        let mut out = format!(
            "**Transcription** ({:.1}s, language: {})\n\n{}\n",
            duration, language, text
        );

        if let Some(segments) = parsed.get("segments").and_then(|s| s.as_array()) {
            if segments.len() > 1 {
                out.push_str("\n**Segments:**\n");
                for seg in segments.iter().take(Self::MAX_LISTED_SEGMENTS) {
                    // Segments missing any of the three fields are skipped silently.
                    if let (Some(start), Some(end), Some(seg_text)) = (
                        seg.get("start").and_then(|v| v.as_f64()),
                        seg.get("end").and_then(|v| v.as_f64()),
                        seg.get("text").and_then(|v| v.as_str()),
                    ) {
                        out.push_str(&format!(
                            "[{:05.1} - {:05.1}] {}\n",
                            start, end, seg_text
                        ));
                    }
                }
                if segments.len() > Self::MAX_LISTED_SEGMENTS {
                    out.push_str(&format!(
                        "... and {} more segments\n",
                        segments.len() - Self::MAX_LISTED_SEGMENTS
                    ));
                }
            }
        }

        out
    }
}

#[async_trait]
impl Tool for TranscribeTool {
    fn name(&self) -> &str {
        "transcribe"
    }

    fn description(&self) -> &str {
        "Transcribe audio files to text using faster-whisper. \
         Supports mp3, wav, m4a, flac, ogg, webm, and other common audio formats. \
         Returns the transcription with timestamps and detected language."
    }

    fn parameters_schema(&self) -> serde_json::Value {
        json!({
            "type": "object",
            "additionalProperties": false,
            "properties": {
                "path": {
                    "type": "string",
                    "description": "Path to the audio file to transcribe"
                },
                "model": {
                    "type": "string",
                    "enum": ["tiny", "base", "small", "medium", "large-v2", "large-v3"],
                    "description": "Whisper model size (default: base). Larger models are more accurate but slower."
                },
                "language": {
                    "type": "string",
                    "description": "Hint for the spoken language (e.g., 'en', 'es', 'zh'). Optional."
                }
            },
            "required": ["path"]
        })
    }

    /// Transcribes the audio file named by `args["path"]`.
    ///
    /// Optional arguments: `model` (must be one of `ALLOWED_MODELS`) and
    /// `language` (forwarded to faster-whisper as a language hint).
    ///
    /// # Errors
    ///
    /// Returns `Err` only when `path` is missing or not a string. Every other
    /// problem (rate limit, disallowed path, missing/oversized/unsupported
    /// file, unknown model, subprocess failure) is reported as
    /// `Ok(ToolResult { success: false, .. })`.
    async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
        let path_str = args
            .get("path")
            .and_then(|v| v.as_str())
            .ok_or_else(|| anyhow::anyhow!("Missing 'path' parameter"))?;

        if self.security.is_rate_limited() {
            return Ok(Self::failure(
                "Rate limit exceeded: too many actions in the last hour",
            ));
        }
        if !self.security.is_path_allowed(path_str) {
            return Ok(Self::failure(format!(
                "Path not allowed by security policy: {}",
                path_str
            )));
        }
        if !self.security.record_action() {
            return Ok(Self::failure(
                "Rate limit exceeded: action budget exhausted",
            ));
        }

        // NOTE(review): `join` discards the workspace prefix when `path_str` is
        // absolute, and symlinks are not resolved here. This relies on
        // `is_path_allowed` rejecting such inputs; confirm against SecurityPolicy.
        let full_path = self.security.workspace_dir.join(path_str);

        // A single stat: distinguish "missing" from other I/O errors instead of
        // reporting every failure as "File not found".
        let metadata = match std::fs::metadata(&full_path) {
            Ok(m) => m,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                return Ok(Self::failure(format!("File not found: {}", path_str)));
            }
            Err(e) => {
                return Ok(Self::failure(format!(
                    "Cannot read file metadata: {}",
                    e
                )));
            }
        };
        if !metadata.is_file() {
            return Ok(Self::failure(format!(
                "Not a regular file: {}",
                path_str
            )));
        }
        if metadata.len() > MAX_AUDIO_BYTES {
            return Ok(Self::failure(format!(
                "File too large: {} bytes (max: {} bytes)",
                metadata.len(),
                MAX_AUDIO_BYTES
            )));
        }
        if !Self::is_supported_format(&full_path) {
            let ext = full_path
                .extension()
                .and_then(|e| e.to_str())
                .unwrap_or("unknown");
            return Ok(Self::failure(format!(
                "Unsupported audio format: {}. Supported: {}",
                ext,
                SUPPORTED_FORMATS.join(", ")
            )));
        }

        // Only caller-supplied models are validated; the constructor default is
        // trusted configuration. A non-string `model` falls back to the default.
        let model = match args.get("model").and_then(|v| v.as_str()) {
            Some(requested) if Self::ALLOWED_MODELS.contains(&requested) => requested,
            Some(requested) => {
                return Ok(Self::failure(format!(
                    "Unsupported model: {}. Supported: {}",
                    requested,
                    Self::ALLOWED_MODELS.join(", ")
                )));
            }
            None => self.model.as_str(),
        };

        // An empty or whitespace-only hint is treated as "no hint".
        let language = args
            .get("language")
            .and_then(|v| v.as_str())
            .map(str::trim)
            .filter(|lang| !lang.is_empty());

        let mut command = Command::new("python3");
        command
            .arg("-c")
            .arg(Self::transcription_script())
            .arg(&full_path)
            .arg(model)
            .arg(&self.device)
            // Do not leave the interpreter running if this future is dropped.
            .kill_on_drop(true);
        if let Some(lang) = language {
            command.arg(lang);
        }

        let result = match command.output().await {
            Ok(result) => result,
            Err(e) => {
                return Ok(Self::failure(format!(
                    "Failed to run transcription: {}",
                    e
                )));
            }
        };

        let stdout = String::from_utf8_lossy(&result.stdout);

        if !result.status.success() {
            let stderr = String::from_utf8_lossy(&result.stderr);
            // The script reports usage errors on stdout, so fall back to it
            // when stderr carries nothing.
            let detail = if stderr.trim().is_empty() {
                stdout.trim()
            } else {
                stderr.trim()
            };
            return Ok(Self::failure(format!("Transcription failed: {}", detail)));
        }

        // Best effort: if stdout is not valid JSON, hand it back unformatted.
        let output = match serde_json::from_str::<serde_json::Value>(&stdout) {
            Ok(parsed) => Self::format_transcript(&parsed, &stdout),
            Err(_) => stdout.to_string(),
        };

        Ok(ToolResult {
            success: true,
            output,
            error: None,
        })
    }
}