Analyzes emotions in MP4 video files by extracting audio, transcribing speech using OpenAI Whisper, and performing LLM-based emotion analysis. Generates interactive and static emotion timeline visualizations with comprehensive sentiment insights.
This skill processes video files through a multi-stage pipeline (sketched end to end after the list below):
1. **Audio Extraction** - Extracts and preprocesses audio from MP4/AVI/MOV/MKV/WebM files
2. **Speech Transcription** - Converts audio to text using OpenAI Whisper with word-level timestamps
3. **Emotion Analysis** - Analyzes 10 emotion types using OpenAI or Anthropic LLMs
4. **Timeline Visualization** - Generates interactive HTML and static PNG/SVG emotion charts
5. **Results Export** - Saves analysis in JSON, CSV, or TXT formats
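
Taken together, the stages compose roughly as follows. This is a minimal orchestration sketch that assumes the module functions described in the sections below (`extract_audio`, `transcribe_audio`, `analyze_emotions`, `create_interactive_timeline`, `create_static_charts`), not a definitive implementation:

```python
def analyze_video(video_path: str) -> list:
    """Run the full pipeline for a single video and return per-segment emotion results."""
    # 1-2. Extract and preprocess audio, then transcribe it with Whisper
    audio, sr = extract_audio(video_path, "temp_audio.wav")
    transcript_segments = transcribe_audio("temp_audio.wav", model_name="base")
    # 3. Analyze emotions per transcript segment with the configured LLM
    emotion_segments = analyze_emotions(transcript_segments, provider="openai", model="gpt-4")
    # 4. Generate interactive and static timeline visualizations
    create_interactive_timeline(emotion_segments, "emotions_timeline.html")
    create_static_charts(emotion_segments, output_dir=".")
    # 5. Results export (JSON/CSV/TXT) would happen here
    return emotion_segments
```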
The project uses `uv` as the package manager. Ensure these dependencies are in `pyproject.toml`:
```toml
dependencies = [
"moviepy>=1.0.3",
"openai-whisper>=20240930",
"openai>=1.51.0",
"librosa>=0.10.2",
"typer>=0.12.0",
"rich>=13.7.0",
"pydantic>=2.9.0",
"matplotlib>=3.8.0",
"seaborn>=0.13.0",
"plotly>=5.17.0",
"kaleido>=0.2.1"
]
```
Install dependencies:
```bash
uv sync
```
Create `.env` from `.env.example` and configure API keys:
```bash
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-...
WHISPER_MODEL=base
LLM_PROVIDER=openai
LLM_MODEL=gpt-4
```
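
At runtime these values can be read from the environment. A minimal sketch using only the standard library (python-dotenv is not among the dependencies listed above, so `.env` values must be exported into the shell or loaded by other means):

```python
import os

# Fall back to the documented defaults when a variable is not set
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai")
LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4")

def get_api_key(provider: str):
    """Return the API key for the chosen provider, if configured."""
    env_var = "OPENAI_API_KEY" if provider == "openai" else "ANTHROPIC_API_KEY"
    return os.getenv(env_var)
```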
The system consists of these key modules:
Extract audio from video files with preprocessing:
```python
from moviepy.editor import VideoFileClip
import librosa
import numpy as np
import soundfile as sf  # installed as a librosa dependency

def extract_audio(video_path: str, output_path: str = "temp_audio.wav"):
    """Extract audio from a video file and preprocess it for Whisper."""
    # Extract audio at 16 kHz (the sample rate Whisper expects)
    video = VideoFileClip(video_path)
    video.audio.write_audiofile(output_path, fps=16000)
    video.close()

    # Load and preprocess
    audio, sr = librosa.load(output_path, sr=16000)
    # Normalize to the [-1, 1] range
    audio = audio / np.max(np.abs(audio))
    # Remove leading and trailing silence (top_db=20)
    audio, _ = librosa.effects.trim(audio, top_db=20)
    # Apply pre-emphasis to suppress low-frequency noise
    audio = librosa.effects.preemphasis(audio)

    # Write the cleaned audio back so the transcription step reads the preprocessed file
    sf.write(output_path, audio, sr)
    return audio, sr
```
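
For example (the input file name is hypothetical):

```python
# The preprocessed WAV is left at temp_audio.wav for the transcription step
audio, sr = extract_audio("interview.mp4", "temp_audio.wav")
print(f"Extracted {len(audio) / sr:.1f} s of audio at {sr} Hz")
```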
Use Whisper to transcribe audio with timestamps:
```python
import math
import whisper

def transcribe_audio(audio_path: str, model_name: str = "base", language: str = None):
    """Transcribe audio using OpenAI Whisper."""
    model = whisper.load_model(model_name)
    # Transcribe with word-level timestamps
    result = model.transcribe(
        audio_path,
        language=language,  # None = auto-detect
        word_timestamps=True,
        verbose=False,
    )
    # Structure segments with timestamps
    segments = []
    for segment in result["segments"]:
        segments.append({
            "start": segment["start"],
            "end": segment["end"],
            "text": segment["text"],
            # Whisper reports an average log-probability per segment rather than a
            # confidence; exponentiate it for an approximate 0-1 score
            "confidence": math.exp(segment["avg_logprob"]),
        })
    return segments
```
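
A quick check of the transcription output, assuming the audio extracted above:

```python
segments = transcribe_audio("temp_audio.wav", model_name="base")
for seg in segments[:3]:
    # Print the first few segments with their timestamps
    print(f"[{seg['start']:6.1f}s - {seg['end']:6.1f}s] {seg['text'].strip()}")
```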
Analyze emotions using LLMs:
```python
import json
from openai import OpenAI
from anthropic import Anthropic
from pydantic import BaseModel

class Emotion(BaseModel):
    type: str  # joy, sadness, anger, fear, surprise, disgust, neutral, excitement, anxiety, contentment
    confidence: float  # 0.0 to 1.0

class EmotionSegment(BaseModel):
    start_time: float
    end_time: float
    text: str
    emotions: list[Emotion]
    dominant_emotion: str
    sentiment_score: float  # -1.0 (negative) to 1.0 (positive)

def analyze_emotions(segments: list, provider: str = "openai", model: str = "gpt-4"):
    """Analyze emotions for each transcript segment."""
    client = OpenAI() if provider == "openai" else Anthropic()
    emotion_segments = []
    for segment in segments:
        prompt = f"""Analyze the emotions in this text segment:

Text: {segment['text']}

Identify emotions from: joy, sadness, anger, fear, surprise, disgust, neutral, excitement, anxiety, contentment.
For each detected emotion, provide a confidence score (0.0-1.0).
Also provide a sentiment score from -1.0 (very negative) to 1.0 (very positive).
Return JSON with keys: emotions (a list of objects with "type" and "confidence"), dominant_emotion, sentiment_score."""

        # Call the selected LLM and parse its JSON response
        if provider == "openai":
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
            )
            raw = response.choices[0].message.content
        else:
            response = client.messages.create(
                model=model,
                max_tokens=1024,
                messages=[{"role": "user", "content": prompt}],
            )
            raw = response.content[0].text
        data = json.loads(raw)
        emotion_segments.append(EmotionSegment(
            start_time=segment["start"],
            end_time=segment["end"],
            text=segment["text"],
            emotions=[Emotion(**e) for e in data["emotions"]],
            dominant_emotion=data["dominant_emotion"],
            sentiment_score=data["sentiment_score"],
        ))
    return emotion_segments
```
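
Chained to the transcription output, for example:

```python
emotion_segments = analyze_emotions(segments, provider="openai", model="gpt-4")
for seg in emotion_segments:
    # Show the dominant emotion and signed sentiment per segment
    print(f"{seg.start_time:6.1f}s  {seg.dominant_emotion:<12} sentiment {seg.sentiment_score:+.2f}")
```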
Generate emotion timeline visualizations:
```python
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

def create_interactive_timeline(emotion_segments: list, output_path: str = "emotions_timeline.html"):
    """Create an interactive Plotly emotion timeline."""
    fig = go.Figure()

    # One horizontal bar per segment, positioned at its start time
    for seg in emotion_segments:
        fig.add_trace(go.Bar(
            x=[seg.end_time - seg.start_time],
            y=[seg.dominant_emotion],
            base=seg.start_time,
            orientation='h',
            name=f"{seg.start_time:.1f}s",
            text=seg.text,
            hovertemplate=(
                f"<b>{seg.dominant_emotion}</b><br>"
                f"Time: {seg.start_time:.1f}s - {seg.end_time:.1f}s<br>"
                f"Text: {seg.text}<br>"
                f"Sentiment: {seg.sentiment_score:.2f}<extra></extra>"
            ),
        ))

    fig.update_layout(
        title="Emotion Timeline",
        xaxis_title="Time (seconds)",
        yaxis_title="Emotion",
        barmode='overlay',
        height=600,
    )
    fig.write_html(output_path)

def create_static_charts(emotion_segments: list, output_dir: str = "."):
    """Create static matplotlib/seaborn charts."""
    sns.set_theme(style="whitegrid")

    # Emotion timeline: sentiment score over time
    plt.figure(figsize=(14, 6))
    times = [seg.start_time for seg in emotion_segments]
    scores = [seg.sentiment_score for seg in emotion_segments]
    plt.plot(times, scores, marker="o")
    plt.xlabel("Time (seconds)")
    plt.ylabel("Sentiment score")
    plt.title("Emotion Timeline")
    plt.savefig(f"{output_dir}/emotions_timeline.png", dpi=300, bbox_inches='tight')
    plt.close()

    # Summary dashboard (emotion distribution pie chart, statistics) can be built similarly
```
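
Both renderers take the same list of emotion segments:

```python
# Write the interactive HTML timeline and the static PNG side by side
create_interactive_timeline(emotion_segments, "emotions_timeline.html")
create_static_charts(emotion_segments, output_dir=".")
```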
Implement Typer-based CLI:
```python
import typer
from rich.console import Console
from rich.progress import track

app = typer.Typer()
console = Console()

@app.command()
def analyze(
    video_path: str,
    format: str = "json",
    whisper_model: str = "base",
    llm_model: str = "gpt-4",
    visualize: bool = True,
    viz_format: str = "both",  # png, html, both
    static: bool = False,
    interactive: bool = False,
):
    """Analyze emotions in a video file."""
    # Validate API keys
    # Extract audio
    # Transcribe
    # Analyze emotions
    # Generate visualizations
    # Save results

@app.command()
def batch(
    video_dir: str,
    output_dir: str = "./results",
    format: str = "json",
    visualize: bool = True,
):
    """Batch process multiple video files."""
    # Process all videos in directory

@app.command()
def info(video_path: str):
    """Display video information."""
    # Show video metadata

if __name__ == "__main__":
    app()
```
```bash
python main.py analyze video.mp4
```
This generates the emotion analysis results (JSON by default) together with the configured timeline visualizations. Control output and visualization formats explicitly:
```bash
python main.py analyze video.mp4 --format csv --viz-format png --static
python main.py analyze video.mp4 --viz-format html --interactive
python main.py analyze video.mp4 --viz-format both
```
```bash
python main.py analyze video.mp4 --whisper-model medium --llm-model gpt-4
python main.py batch /videos/ --output-dir /results/ --visualize
python main.py analyze video.mp4 --no-visualize
```
Define Pydantic models in `models.py`:
```python
from pydantic import BaseModel
from enum import Enum

class EmotionType(str, Enum):
    JOY = "joy"
    SADNESS = "sadness"
    ANGER = "anger"
    FEAR = "fear"
    SURPRISE = "surprise"
    DISGUST = "disgust"
    NEUTRAL = "neutral"
    EXCITEMENT = "excitement"
    ANXIETY = "anxiety"
    CONTENTMENT = "contentment"

class Emotion(BaseModel):
    type: EmotionType
    confidence: float  # 0.0 to 1.0

class TranscriptSegment(BaseModel):
    start: float
    end: float
    text: str
    confidence: float

class EmotionSegment(BaseModel):
    start_time: float
    end_time: float
    text: str
    emotions: list[Emotion]
    dominant_emotion: EmotionType
    sentiment_score: float  # -1.0 to 1.0

class EmotionAnalysisResult(BaseModel):
    video_path: str
    duration: float
    segments: list[EmotionSegment]
    overall_sentiment: float
    emotion_distribution: dict[EmotionType, float]
```
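
The per-segment results can then be rolled up into an `EmotionAnalysisResult`. A minimal sketch; the `build_result` helper is illustrative, not one of the modules above:

```python
from collections import Counter

def build_result(video_path: str, duration: float,
                 segments: list[EmotionSegment]) -> EmotionAnalysisResult:
    """Aggregate per-segment emotions into an overall result."""
    counts = Counter(seg.dominant_emotion for seg in segments)
    total = sum(counts.values()) or 1
    overall = sum(seg.sentiment_score for seg in segments) / max(len(segments), 1)
    return EmotionAnalysisResult(
        video_path=video_path,
        duration=duration,
        segments=segments,
        overall_sentiment=overall,
        # Share of segments in which each emotion was dominant
        emotion_distribution={emotion: n / total for emotion, n in counts.items()},
    )
```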
Implement robust error handling:
```python
import atexit
import os
import tempfile
import typer
from pathlib import Path

def validate_api_keys():
    """Validate that at least one required API key is present."""
    if not os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"):
        raise ValueError("At least one API key (OPENAI_API_KEY or ANTHROPIC_API_KEY) must be set")

def validate_video_file(path: str):
    """Validate that the video file exists and has a supported format."""
    file_path = Path(path)
    if not file_path.exists():
        raise FileNotFoundError(f"Video file not found: {path}")
    supported_formats = ['.mp4', '.avi', '.mov', '.mkv', '.webm']
    if file_path.suffix.lower() not in supported_formats:
        raise ValueError(f"Unsupported format: {file_path.suffix}. Supported: {supported_formats}")

# Track temporary audio files and remove them at interpreter exit
temp_files = []

def cleanup_temp_files():
    for file in temp_files:
        try:
            os.remove(file)
        except OSError:
            pass

atexit.register(cleanup_temp_files)
```
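
Inside the `analyze` command, these validators can be wired up to fail fast with a clean exit code. A sketch under the assumptions above (the error message wording is illustrative):

```python
try:
    validate_api_keys()
    validate_video_file(video_path)
except (ValueError, FileNotFoundError) as exc:
    console.print(f"[red]Error:[/red] {exc}")
    raise typer.Exit(code=1)

# Register the temporary audio file so it is removed at exit
temp_files.append("temp_audio.wav")
```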
1. **Audio Processing**: Always extract audio at 16kHz sample rate for Whisper compatibility
2. **API Keys**: Validate API keys before starting processing to fail fast
3. **File Cleanup**: Automatically clean up temporary audio files after processing
4. **Segmentation**: For videos longer than 30 minutes, use 30-second segments with 5-second overlap (see the chunking sketch after this list)
5. **Multi-language**: Support Japanese and other Unicode text in visualizations (set matplotlib font config)
6. **Memory Management**: Process large videos in chunks to avoid memory issues
7. **Error Recovery**: Implement continue-on-error for batch processing to handle individual file failures
8. **Output Validation**: Ensure all output files are written before completing analysis
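
For the long-video segmentation in item 4, a minimal chunking sketch with the window and overlap sizes stated above; `chunk_audio` is an illustrative helper, not an existing module function:

```python
import numpy as np

def chunk_audio(audio: np.ndarray, sr: int, window_s: float = 30.0, overlap_s: float = 5.0):
    """Yield (start_time_seconds, chunk) pairs covering the signal with overlap."""
    step = int((window_s - overlap_s) * sr)
    size = int(window_s * sr)
    for start in range(0, len(audio), step):
        chunk = audio[start:start + size]
        if len(chunk) == 0:
            break
        yield start / sr, chunk
        # Stop once the window reaches the end of the signal
        if start + size >= len(audio):
            break
```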