feat: add homelab configuration and transcription support
Some checks failed
CI / Detect Change Scope (push) Has been cancelled
CI / Format & Lint (push) Has been cancelled
CI / Lint Strict Delta (push) Has been cancelled
CI / Test (push) Has been cancelled
CI / Build (Smoke) (push) Has been cancelled
CI / Docs-Only Fast Path (push) Has been cancelled
CI / Non-Rust Fast Path (push) Has been cancelled
CI / Docs Quality (push) Has been cancelled
CI / CI Required Gate (push) Has been cancelled
Docker / PR Docker Smoke (push) Has been cancelled
Docker / Build and Push Docker Image (push) Has been cancelled
Rust Package Security Audit / Security Audit (push) Has been cancelled
Rust Package Security Audit / License & Supply Chain (push) Has been cancelled
Some checks failed
CI / Detect Change Scope (push) Has been cancelled
CI / Format & Lint (push) Has been cancelled
CI / Lint Strict Delta (push) Has been cancelled
CI / Test (push) Has been cancelled
CI / Build (Smoke) (push) Has been cancelled
CI / Docs-Only Fast Path (push) Has been cancelled
CI / Non-Rust Fast Path (push) Has been cancelled
CI / Docs Quality (push) Has been cancelled
CI / CI Required Gate (push) Has been cancelled
Docker / PR Docker Smoke (push) Has been cancelled
Docker / Build and Push Docker Image (push) Has been cancelled
Rust Package Security Audit / Security Audit (push) Has been cancelled
Rust Package Security Audit / License & Supply Chain (push) Has been cancelled
- Add host-based delegate agents (ubuntu, grizzley, truenas, panda, pve)
- Add functional delegates (coder, reasoner, research, quick)
- Add NanoGPT provider with minimax-m2.5 model
- Add transcribe tool using faster-whisper
- Update TelegramChannel with workspace_dir
- Configure Z.AI as default provider with glm-5
This commit is contained in:
@@ -633,6 +633,7 @@ pub async fn doctor_channels(config: Config) -> Result<()> {
|
||||
Arc::new(TelegramChannel::new(
|
||||
tg.bot_token.clone(),
|
||||
tg.allowed_users.clone(),
|
||||
Some(config.workspace_dir.clone()),
|
||||
)),
|
||||
));
|
||||
}
|
||||
@@ -923,6 +924,7 @@ pub async fn start_channels(config: Config) -> Result<()> {
|
||||
channels.push(Arc::new(TelegramChannel::new(
|
||||
tg.bot_token.clone(),
|
||||
tg.allowed_users.clone(),
|
||||
Some(config.workspace_dir.clone()),
|
||||
)));
|
||||
}
|
||||
|
||||
|
||||
@@ -179,18 +179,19 @@ fn parse_attachment_markers(message: &str) -> (String, Vec<TelegramAttachment>)
|
||||
}
|
||||
|
||||
/// Telegram channel — long-polls the Bot API for updates
|
||||
pub struct TelegramChannel {
|
||||
pub struct TelegramChannel { workspace_dir: std::path::PathBuf,
|
||||
bot_token: String,
|
||||
allowed_users: Vec<String>,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
impl TelegramChannel {
|
||||
pub fn new(bot_token: String, allowed_users: Vec<String>) -> Self {
|
||||
pub fn new(bot_token: String, allowed_users: Vec<String>, workspace_dir: Option<std::path::PathBuf>) -> Self {
|
||||
Self {
|
||||
bot_token,
|
||||
allowed_users,
|
||||
client: reqwest::Client::new(),
|
||||
workspace_dir: workspace_dir.unwrap_or_else(|| std::path::PathBuf::from("/tmp")),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -231,7 +231,7 @@ async fn deliver_if_configured(config: &Config, job: &CronJob, output: &str) ->
|
||||
.telegram
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow::anyhow!("telegram channel not configured"))?;
|
||||
let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone());
|
||||
let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone(), Some(config.workspace_dir.clone()));
|
||||
channel.send(output, target).await?;
|
||||
}
|
||||
"discord" => {
|
||||
|
||||
@@ -135,6 +135,7 @@ fn resolve_provider_credential(name: &str, credential_override: Option<&str>) ->
|
||||
"zai" | "z.ai" => vec!["ZAI_API_KEY"],
|
||||
"nvidia" | "nvidia-nim" | "build.nvidia.com" => vec!["NVIDIA_API_KEY"],
|
||||
"synthetic" => vec!["SYNTHETIC_API_KEY"],
|
||||
"nanogpt" | "nano-gpt" => vec!["NANO_GPT_API_KEY"],
|
||||
"opencode" | "opencode-zen" => vec!["OPENCODE_API_KEY"],
|
||||
"vercel" | "vercel-ai" => vec!["VERCEL_API_KEY"],
|
||||
"cloudflare" | "cloudflare-ai" => vec!["CLOUDFLARE_API_KEY"],
|
||||
@@ -246,6 +247,12 @@ pub fn create_provider_with_url(
|
||||
key,
|
||||
AuthStyle::Bearer,
|
||||
))),
|
||||
"nanogpt" | "nano-gpt" => Ok(Box::new(OpenAiCompatibleProvider::new(
|
||||
"NanoGPT",
|
||||
"https://nano-gpt.com/api/v1",
|
||||
key,
|
||||
AuthStyle::Bearer,
|
||||
))),
|
||||
"bedrock" | "aws-bedrock" => Ok(Box::new(OpenAiCompatibleProvider::new(
|
||||
"Amazon Bedrock",
|
||||
"https://bedrock-runtime.us-east-1.amazonaws.com",
|
||||
|
||||
@@ -25,6 +25,7 @@ pub mod schema;
|
||||
pub mod screenshot;
|
||||
pub mod shell;
|
||||
pub mod traits;
|
||||
pub mod transcribe;
|
||||
|
||||
pub use browser::{BrowserTool, ComputerUseConfig};
|
||||
pub use browser_open::BrowserOpenTool;
|
||||
@@ -53,6 +54,7 @@ pub use schema::{CleaningStrategy, SchemaCleanr};
|
||||
pub use screenshot::ScreenshotTool;
|
||||
pub use shell::ShellTool;
|
||||
pub use traits::Tool;
|
||||
pub use transcribe::TranscribeTool;
|
||||
#[allow(unused_imports)]
|
||||
pub use traits::{ToolResult, ToolSpec};
|
||||
|
||||
@@ -191,6 +193,8 @@ pub fn all_tools_with_runtime(
|
||||
tools.push(Box::new(ScreenshotTool::new(security.clone())));
|
||||
tools.push(Box::new(ImageInfoTool::new(security.clone())));
|
||||
|
||||
tools.push(Box::new(TranscribeTool::new(security.clone(), None, None)));
|
||||
|
||||
if let Some(key) = composio_key {
|
||||
if !key.is_empty() {
|
||||
tools.push(Box::new(ComposioTool::new(key, composio_entity_id)));
|
||||
|
||||
283
src/tools/transcribe.rs
Normal file
283
src/tools/transcribe.rs
Normal file
@@ -0,0 +1,283 @@
|
||||
use super::traits::{Tool, ToolResult};
|
||||
use crate::security::SecurityPolicy;
|
||||
use async_trait::async_trait;
|
||||
use serde_json::json;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tokio::process::Command;
|
||||
|
||||
/// Largest audio file, in bytes, that the transcribe tool accepts (100 MiB).
const MAX_AUDIO_BYTES: u64 = 100 * 1024 * 1024;

/// Lower-case file extensions the transcribe tool accepts.
const SUPPORTED_FORMATS: &[&str] = &[
    "mp3", "wav", "m4a", "flac", "ogg", "webm", "mp4", "mpeg", "mpga",
];
|
||||
|
||||
pub struct TranscribeTool {
|
||||
security: Arc<SecurityPolicy>,
|
||||
model: String,
|
||||
device: String,
|
||||
}
|
||||
|
||||
impl TranscribeTool {
|
||||
pub fn new(security: Arc<SecurityPolicy>, model: Option<String>, device: Option<String>) -> Self {
|
||||
Self {
|
||||
security,
|
||||
model: model.unwrap_or_else(|| "base".to_string()),
|
||||
device: device.unwrap_or_else(|| "cpu".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_supported_format(path: &Path) -> bool {
|
||||
path.extension()
|
||||
.and_then(|ext| ext.to_str())
|
||||
.map(|ext| SUPPORTED_FORMATS.contains(&ext.to_lowercase().as_str()))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn transcription_script() -> &'static str {
|
||||
r#"
|
||||
import sys
|
||||
import json
|
||||
|
||||
def transcribe(audio_path, model_size, device):
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
model = WhisperModel(model_size, device=device, compute_type="int8")
|
||||
segments, info = model.transcribe(audio_path, beam_size=5)
|
||||
|
||||
transcription = []
|
||||
for segment in segments:
|
||||
transcription.append({
|
||||
"start": round(segment.start, 2),
|
||||
"end": round(segment.end, 2),
|
||||
"text": segment.text.strip()
|
||||
})
|
||||
|
||||
result = {
|
||||
"language": info.language,
|
||||
"language_probability": round(info.language_probability, 2),
|
||||
"duration": round(info.duration, 2),
|
||||
"segments": transcription,
|
||||
"text": " ".join(s["text"] for s in transcription)
|
||||
}
|
||||
|
||||
print(json.dumps(result))
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 4:
|
||||
print(json.dumps({"error": "Usage: script.py <audio_path> <model> <device>"}))
|
||||
sys.exit(1)
|
||||
|
||||
transcribe(sys.argv[1], sys.argv[2], sys.argv[3])
|
||||
"#
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Tool for TranscribeTool {
|
||||
fn name(&self) -> &str {
|
||||
"transcribe"
|
||||
}
|
||||
|
||||
fn description(&self) -> &str {
|
||||
"Transcribe audio files to text using faster-whisper. \
|
||||
Supports mp3, wav, m4a, flac, ogg, webm, and other common audio formats. \
|
||||
Returns the transcription with timestamps and detected language."
|
||||
}
|
||||
|
||||
fn parameters_schema(&self) -> serde_json::Value {
|
||||
json!({
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Path to the audio file to transcribe"
|
||||
},
|
||||
"model": {
|
||||
"type": "string",
|
||||
"enum": ["tiny", "base", "small", "medium", "large-v2", "large-v3"],
|
||||
"description": "Whisper model size (default: base). Larger models are more accurate but slower."
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"description": "Hint for the spoken language (e.g., 'en', 'es', 'zh'). Optional."
|
||||
}
|
||||
},
|
||||
"required": ["path"]
|
||||
})
|
||||
}
|
||||
|
||||
async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
|
||||
let path_str = args
|
||||
.get("path")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing 'path' parameter"))?;
|
||||
|
||||
if self.security.is_rate_limited() {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some("Rate limit exceeded: too many actions in the last hour".into()),
|
||||
});
|
||||
}
|
||||
|
||||
if !self.security.is_path_allowed(path_str) {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("Path not allowed by security policy: {}", path_str)),
|
||||
});
|
||||
}
|
||||
|
||||
if !self.security.record_action() {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some("Rate limit exceeded: action budget exhausted".into()),
|
||||
});
|
||||
}
|
||||
|
||||
let path = Path::new(path_str);
|
||||
let full_path = self.security.workspace_dir.join(path);
|
||||
|
||||
if !full_path.exists() {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("File not found: {}", path_str)),
|
||||
});
|
||||
}
|
||||
|
||||
let metadata = match std::fs::metadata(&full_path) {
|
||||
Ok(m) => m,
|
||||
Err(e) => {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("Cannot read file metadata: {}", e)),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
if metadata.len() > MAX_AUDIO_BYTES {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!(
|
||||
"File too large: {} bytes (max: {} bytes)",
|
||||
metadata.len(),
|
||||
MAX_AUDIO_BYTES
|
||||
)),
|
||||
});
|
||||
}
|
||||
|
||||
if !Self::is_supported_format(&full_path) {
|
||||
let ext = full_path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.unwrap_or("unknown");
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!(
|
||||
"Unsupported audio format: {}. Supported: {}",
|
||||
ext,
|
||||
SUPPORTED_FORMATS.join(", ")
|
||||
)),
|
||||
});
|
||||
}
|
||||
|
||||
let model = args
|
||||
.get("model")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or(&self.model);
|
||||
|
||||
let script = Self::transcription_script();
|
||||
let output = Command::new("python3")
|
||||
.arg("-c")
|
||||
.arg(script)
|
||||
.arg(&full_path)
|
||||
.arg(model)
|
||||
.arg(&self.device)
|
||||
.output()
|
||||
.await;
|
||||
|
||||
match output {
|
||||
Ok(result) => {
|
||||
if result.status.success() {
|
||||
let stdout = String::from_utf8_lossy(&result.stdout);
|
||||
match serde_json::from_str::<serde_json::Value>(&stdout) {
|
||||
Ok(json) => {
|
||||
let text = json
|
||||
.get("text")
|
||||
.and_then(|t| t.as_str())
|
||||
.unwrap_or(&stdout);
|
||||
let language = json
|
||||
.get("language")
|
||||
.and_then(|l| l.as_str())
|
||||
.unwrap_or("unknown");
|
||||
let duration = json
|
||||
.get("duration")
|
||||
.and_then(|d| d.as_f64())
|
||||
.unwrap_or(0.0);
|
||||
|
||||
let duration_f = duration;
|
||||
let mut out = format!(
|
||||
"**Transcription** ({:.1}, language: {})\n\n{}\n",
|
||||
duration_f, language, text
|
||||
);
|
||||
|
||||
if let Some(segments) = json.get("segments").and_then(|s| s.as_array())
|
||||
{
|
||||
if segments.len() > 1 {
|
||||
out.push_str("\n**Segments:**\n");
|
||||
for seg in segments.iter().take(20) {
|
||||
if let (Some(start), Some(end), Some(seg_text)) = (
|
||||
seg.get("start").and_then(|v| v.as_f64()),
|
||||
seg.get("end").and_then(|v| v.as_f64()),
|
||||
seg.get("text").and_then(|v| v.as_str()),
|
||||
) {
|
||||
let start_f = start;
|
||||
let end_f = end;
|
||||
out.push_str(&format!(
|
||||
"[{:05.1} - {:05.1}] {}\n",
|
||||
start_f, end_f, seg_text
|
||||
));
|
||||
}
|
||||
}
|
||||
if segments.len() > 20 {
|
||||
out.push_str(&format!(
|
||||
"... and {} more segments\n",
|
||||
segments.len() - 20
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ToolResult {
|
||||
success: true,
|
||||
output: out,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
Err(_) => Ok(ToolResult {
|
||||
success: true,
|
||||
output: stdout.to_string(),
|
||||
error: None,
|
||||
}),
|
||||
}
|
||||
} else {
|
||||
let stderr = String::from_utf8_lossy(&result.stderr);
|
||||
Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("Transcription failed: {}", stderr.trim())),
|
||||
})
|
||||
}
|
||||
}
|
||||
Err(e) => Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("Failed to run transcription: {}", e)),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user