From 5632a3d6410e2c90699dfdd169957867a0e1d64e Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 12:02:33 +0200 Subject: [PATCH] Initial release: GTS-HolMirDas v1.0 RSS-based content discovery for GoToSocial instances. Features: - Multi-instance RSS feed processing - Docker deployment with .env configuration - Comprehensive statistics and monitoring - Production-ready with proper secret management --- .env.example | 14 +++ .gitignore | 29 +++++ Dockerfile | 24 ++++ LICENSE | 21 ++++ README.md | 175 +++++++++++++++++++++++++++ compose.yml | 32 +++++ gts_holmirdas.py | 275 ++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 + rss_feeds.example.txt | 15 +++ 9 files changed, 588 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 README.md create mode 100644 compose.yml create mode 100644 gts_holmirdas.py create mode 100644 requirements.txt create mode 100644 rss_feeds.example.txt diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..23d6c88 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# GTS Server Configuration +GTS_SERVER_URL=https://your-gts-instance.tld +GTS_ACCESS_TOKEN=your_gts_access_token_here + +# Processing Configuration +MAX_POSTS_PER_RUN=25 +DELAY_BETWEEN_REQUESTS=1 +LOG_LEVEL=INFO + +# RSS Configuration +RSS_URLS_FILE=/app/rss_feeds.txt + +# Optional: Monitoring +# HEALTHCHECK_URL=https://hc-ping.com/your-uuid-here diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3e5dcf8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Secrets and local config +.env +rss_feeds.txt + +# Data directory +data/ +*.json + +# Python +__pycache__/ +*.pyc +*.pyo + +# Docker +.dockerignore + +# Logs +*.log + +# Editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..57540bd --- /dev/null +++ b/Dockerfile 
@@ -0,0 +1,24 @@ +# Dockerfile +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +# Copy and install requirements +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Create data directory +RUN mkdir -p /app/data + +# Create non-root user +RUN useradd -r -u 1000 holmirdas + +# Set ownership +RUN chown -R holmirdas:holmirdas /app + +# Switch to non-root user +USER holmirdas + +# Default command (will be overridden by docker-compose) +CMD ["python", "gts_holmirdas.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..450a765 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Matthias + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a4fcd75 --- /dev/null +++ b/README.md @@ -0,0 +1,175 @@ +# GTS-HolMirDas 🚀 + +RSS-based content discovery for GoToSocial instances. 
+ +Automatically discovers and federates content from RSS feeds across the Fediverse, helping small GoToSocial instances populate their federated timeline without relying on traditional relays. + +## Features + +- 📡 **Multi-Instance RSS Discovery** - Fetches content from configurable RSS feeds across Fediverse instances +- ⚡ **Efficient Processing** - Configurable rate limiting and duplicate detection +- 🔧 **Production Ready** - Environment-based config, Docker deployment, health monitoring +- 📊 **Comprehensive Statistics** - Runtime metrics, content processing, and federation growth tracking +- 🐳 **Containerized** - Simple Docker Compose deployment +- 📁 **File-based Configuration** - Easy RSS feed management via text files + +## How it Works + +GTS-HolMirDas reads RSS feeds from various Fediverse instances and uses GoToSocial's search API to federate the discovered content. This approach: + +- Maintains proper ActivityPub federation (posts remain interactive) +- Respects rate limits and instance policies +- Provides better content discovery for small instances +- Works alongside tools like FediFetcher for comprehensive federation + +## Quick Start + +```bash +# Clone the repository +git clone https://your-forgejo-instance.tld/user/gts-holmirdas +cd gts-holmirdas + +# Copy configuration templates +cp .env.example .env +cp rss_feeds.example.txt rss_feeds.txt + +# Edit configuration +nano .env # Add your GTS credentials +nano rss_feeds.txt # Customize RSS feeds + +# Deploy +docker-compose up -d + +# Monitor +docker-compose logs -f +``` + +## Configuration + +### Environment Variables (.env) + +```bash +# GTS Server Configuration +GTS_SERVER_URL=https://your-gts-instance.tld +GTS_ACCESS_TOKEN=your_gts_access_token + +# Processing Configuration +MAX_POSTS_PER_RUN=25 # Posts per feed per run +DELAY_BETWEEN_REQUESTS=1 # Seconds between API calls +LOG_LEVEL=INFO # Logging verbosity + +# RSS Configuration +RSS_URLS_FILE=/app/rss_feeds.txt # Path to RSS feeds 
file + +# Optional: Monitoring +HEALTHCHECK_URL=https://hc-ping.com/your-uuid +``` + +### RSS Feeds (rss_feeds.txt) + +``` +# Example RSS feeds - customize for your interests +# homelab +https://mastodon.social/tags/homelab.rss +https://fosstodon.org/tags/homelab.rss + +# selfhosting +https://mastodon.social/tags/selfhosting.rss +https://infosec.exchange/tags/selfhosting.rss + +# Add your preferred instances and hashtags +``` + +## Access Token Setup + +1. Log in to your GoToSocial instance +2. Go to Settings → Applications +3. Create new application with scopes: `read`, `read:search`, `read:statuses` +4. Copy the access token to your `.env` file + +## Statistics Output + +``` +📊 GTS-HolMirDas Run Statistics: + ⏱️ Runtime: 0:04:14 + 📄 Total posts processed: 45 + 🌐 Current known instances: 2519 + ➕ New instances discovered: +3 + 📡 RSS feeds processed: 25 + ⚡ Posts per minute: 10.6 +``` + +## Resource Requirements + +- **Memory**: ~200-500MB depending on feed count +- **CPU**: Minimal (mostly I/O bound) +- **Storage**: <100MB for application, plus log storage +- **Network**: Depends on RSS feed count and frequency + +## Deployment Options + +### Docker Compose (Recommended) +```bash +docker-compose up -d +``` + +### Standalone Docker +```bash +docker build -t gts-holmirdas . 
+docker run -d --env-file .env \ + -v ./data:/app/data \ + -v ./gts_holmirdas.py:/app/gts_holmirdas.py:ro \ + -v ./rss_feeds.txt:/app/rss_feeds.txt:ro \ + gts-holmirdas +``` + +## Monitoring + +- **Logs**: `docker-compose logs -f` +- **Health**: Optional Healthchecks.io integration +- **Statistics**: Built-in runtime and performance metrics +- **Resource Usage**: Docker stats or container monitoring tools + +## Troubleshooting + +### Common Issues + +**No posts processed**: Check access token permissions and RSS feed URLs + +**Rate limiting errors**: Increase `DELAY_BETWEEN_REQUESTS` or reduce feed count + +**High memory usage**: Reduce `MAX_POSTS_PER_RUN` or feed frequency + +**Container won't start**: Verify `.env` file format and required variables + +### Debug Mode + +```bash +# Enable debug logging +echo "LOG_LEVEL=DEBUG" >> .env +docker-compose restart gts-holmirdas +``` + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Test thoroughly +5. Submit a pull request + +## Related Projects + +- [FediFetcher](https://github.com/nanos/fedifetcher) - Fetches missing replies and posts +- [GoToSocial](https://github.com/superseriousbusiness/gotosocial) - Lightweight ActivityPub server +- [slurp](https://github.com/VyrCossont/slurp) - Import posts from other instances + +## License + +MIT License - see LICENSE file for details. + +## Acknowledgments + +- Inspired by the [HolMirDas](https://github.com/aliceif/HolMirDas) concept +- Built for the GoToSocial community +- RSS-to-ActivityPub approach inspired by Fediverse discovery challenges diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000..c64bc46 --- /dev/null +++ b/compose.yml @@ -0,0 +1,32 @@ +services: + gts-holmirdas: + build: . 
class GTSHolMirDas:
    """RSS-driven content discovery for a GoToSocial (GTS) instance.

    Post URLs are collected from the configured RSS feeds and handed to the
    GTS search API with ``resolve=true`` so the server looks them up.
    Successfully looked-up URLs and the last known instance count are kept in
    a JSON state file between runs.
    """

    def __init__(self):
        """Read configuration from the environment, set up logging, load state.

        Raises:
            ValueError: If ``MAX_POSTS_PER_RUN`` or ``DELAY_BETWEEN_REQUESTS``
                is not a number.
        """
        log_level_name = os.getenv("LOG_LEVEL", "INFO").strip().upper()
        self.config = {
            # Trailing slashes are dropped so appended API paths never contain "//".
            "server_url": os.getenv("GTS_SERVER_URL", "https://your-gts-instance").rstrip("/"),
            "access_token": os.getenv("GTS_ACCESS_TOKEN", ""),
            "max_posts_per_run": int(os.getenv("MAX_POSTS_PER_RUN", "25")),
            # float() accepts whole and fractional seconds; time.sleep()
            # rejects negative values, hence the clamp.
            "delay_between_requests": max(0.0, float(os.getenv("DELAY_BETWEEN_REQUESTS", "2"))),
            "healthcheck_url": os.getenv("HEALTHCHECK_URL", ""),
            "log_level": log_level_name,
        }

        # Setup logging FIRST so every later step can report problems.
        level = getattr(logging, log_level_name, None)
        level_is_valid = isinstance(level, int)
        logging.basicConfig(
            level=level if level_is_valid else logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)
        if not level_is_valid:
            self.logger.warning(f"Unknown LOG_LEVEL {log_level_name!r}, falling back to INFO")

        self.config["rss_urls"] = self._load_rss_urls()

        # Load processed URLs from persistent storage. previous_instances must
        # exist before load_processed_urls() runs, because that method
        # overwrites it when the state file holds a stored count.
        self.processed_urls_file = "/app/data/processed_urls.json"
        self.previous_instances = 0
        self.processed_urls = self.load_processed_urls()

    def _load_rss_urls(self):
        """Return feed URLs from RSS_URLS_FILE, else from the RSS_URLS variable.

        In the file, blank lines and lines whose first non-blank character is
        ``#`` are ignored. ``RSS_URLS`` is a comma-separated list.
        """
        rss_urls_file = os.getenv("RSS_URLS_FILE")
        if rss_urls_file and os.path.exists(rss_urls_file):
            try:
                with open(rss_urls_file, 'r', encoding='utf-8') as f:
                    # Strip before testing for '#' so indented comments are skipped too.
                    stripped_lines = (line.strip() for line in f)
                    urls = [
                        line for line in stripped_lines
                        if line and not line.startswith('#')
                    ]
            except (OSError, UnicodeError) as e:
                self.logger.error(f"Could not load RSS URLs from file {rss_urls_file}: {e}")
                return []
            self.logger.info(f"Loaded {len(urls)} RSS URLs from file: {rss_urls_file}")
            return urls

        # Fallback to environment variable
        urls = [
            url.strip() for url in os.getenv("RSS_URLS", "").split(",")
            if url.strip()
        ]
        if urls:
            self.logger.info(f"Loaded {len(urls)} RSS URLs from environment")
        return urls

    def load_processed_urls(self):
        """Load previously processed URLs and instance count from file.

        Side effect: sets ``self.previous_instances`` when the state file can
        be read (0 if the stored value is missing or not an integer).

        Returns:
            set: The stored URLs, or an empty set when the file is missing or
            unreadable.
        """
        try:
            if os.path.exists(self.processed_urls_file):
                with open(self.processed_urls_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Load previous instance count for statistics
                stored_count = data.get('previous_instances', 0)
                self.previous_instances = stored_count if isinstance(stored_count, int) else 0
                return set(data.get('processed_urls', []))
        except Exception as e:
            # Best effort: a damaged state file only means URLs get looked up again.
            self.logger.warning(f"Could not load processed URLs: {e}")

        return set()

    def save_processed_urls(self, current_instances=None):
        """Save processed URLs and current instance count to file.

        Args:
            current_instances: Instance count to store for the next run. With
                ``None`` or ``'unknown'`` the previously known count (if any)
                is written again, so one failed lookup does not erase the
                baseline.

        Errors are logged, not raised.
        """
        try:
            directory = os.path.dirname(self.processed_urls_file)
            if directory:  # os.makedirs('') raises for a bare file name
                os.makedirs(directory, exist_ok=True)

            data = {
                # Sorted for a stable, diff-friendly file.
                'processed_urls': sorted(self.processed_urls, key=str),
                'last_updated': time.time()
            }
            # Save current instance count for next run
            if current_instances is not None and current_instances != 'unknown':
                data['previous_instances'] = current_instances
            elif getattr(self, 'previous_instances', 0):
                data['previous_instances'] = self.previous_instances

            # Write to a sibling temp file and swap it in, so an interrupted
            # write cannot leave a truncated state file behind.
            tmp_path = f"{self.processed_urls_file}.tmp"
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            os.replace(tmp_path, self.processed_urls_file)
        except Exception as e:
            self.logger.error(f"Could not save processed URLs: {e}")

    def fetch_rss_urls(self, rss_url):
        """Fetch URLs from RSS feed.

        Args:
            rss_url: Address of the RSS feed.

        Returns:
            list: The ``link`` of every feed entry that has one; empty on any
            error (errors are logged, not raised).
        """
        try:
            self.logger.info(f"Fetching RSS feed: {rss_url}")

            # Download with an explicit timeout and parse the bytes, so a
            # stalled server cannot block the whole run.
            response = requests.get(rss_url, timeout=30)
            response.raise_for_status()
            feed = feedparser.parse(response.content)

            if feed.bozo:
                self.logger.warning(f"RSS feed may have issues: {rss_url}")

            # Extract URLs from entries
            urls = [entry.link for entry in feed.entries if getattr(entry, 'link', None)]

            self.logger.info(f"Found {len(urls)} URLs in RSS feed")
            return urls

        except Exception as e:
            self.logger.error(f"Error fetching RSS feed {rss_url}: {e}")
            return []

    def lookup_post(self, post_url):
        """Look up a post URL using GTS search API.

        Args:
            post_url: URL of the post to resolve.

        Returns:
            bool: True when the search returned at least one status or
            account, False on empty results, HTTP errors or request failures.
        """
        try:
            # Prepare search API call
            search_url = f"{self.config['server_url']}/api/v2/search"
            params = {
                'q': post_url,
                'type': 'statuses',
                'resolve': 'true',
                'limit': 1
            }
            headers = {
                'Authorization': f'Bearer {self.config["access_token"]}',
                'Content-Type': 'application/json'
            }

            # Make API call
            response = requests.get(
                search_url,
                params=params,
                headers=headers,
                timeout=30
            )

            if response.status_code != 200:
                self.logger.error(f"API error {response.status_code} for {post_url}: {response.text}")
                return False

            results = response.json()
            if results.get('statuses') or results.get('accounts'):
                self.logger.info(f"Successfully looked up: {post_url}")
                return True

            self.logger.warning(f"No results for: {post_url}")
            return False

        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            self.logger.error(f"Error looking up {post_url}: {e}")
            return False

    def _select_new_urls(self, urls):
        """Return unseen URLs from *urls*, de-duplicated in feed order and capped per run."""
        unseen = [url for url in dict.fromkeys(urls) if url not in self.processed_urls]
        # Rate limiting: max posts per run (applied per feed)
        return unseen[:self.config["max_posts_per_run"]]

    def _fetch_instance_count(self):
        """Return the instance's ``stats.domain_count`` as int, or 'unknown'."""
        try:
            instance_info = requests.get(
                f"{self.config['server_url']}/api/v1/instance",
                headers={'Authorization': f'Bearer {self.config["access_token"]}'},
                timeout=10
            )
            if instance_info.status_code == 200:
                count = instance_info.json().get('stats', {}).get('domain_count')
                if isinstance(count, int):
                    return count
        except Exception as e:
            self.logger.error(f"Failed to get instance count: {e}")
        return 'unknown'

    def _print_statistics(self, total_processed, runtime_seconds, current_instances):
        """Print the end-of-run summary to stdout."""
        runtime_formatted = str(timedelta(seconds=int(runtime_seconds)))

        print("\n📊 GTS-HolMirDas Run Statistics:")
        print(f" ⏱️ Runtime: {runtime_formatted}")
        print(f" 📄 Total posts processed: {total_processed}")
        print(f" 🌐 Current known instances: {current_instances}")
        # The delta needs both a stored baseline and a numeric current count;
        # a negative delta is not reported.
        if self.previous_instances > 0 and isinstance(current_instances, int):
            new_instances = current_instances - self.previous_instances
            if new_instances >= 0:
                print(f" ➕ New instances discovered: +{new_instances}")
        print(f" 📡 RSS feeds processed: {len(self.config['rss_urls'])}")
        if runtime_seconds > 60:
            print(f" ⚡ Posts per minute: {total_processed / (runtime_seconds / 60):.1f}")

    def process_feeds(self):
        """Process all configured RSS feeds and print run statistics.

        Pings the healthcheck ``/start`` endpoint first and the base URL on
        success.

        Raises:
            Exception: Any unexpected error is logged, reported to the
                healthcheck ``/fail`` endpoint and re-raised. URLs handled
                before the error are still written to the state file.
        """
        total_processed = 0

        # Record start time for statistics
        self.start_time = time.time()

        # Ping healthcheck start
        self.ping_healthcheck("/start")

        if not self.config["rss_urls"]:
            self.logger.warning("No RSS feeds configured (RSS_URLS_FILE / RSS_URLS)")

        try:
            for rss_url in self.config["rss_urls"]:
                if not rss_url.strip():
                    continue

                self.logger.info(f"Processing feed: {rss_url}")

                # Get URLs from RSS and drop those handled in earlier runs/feeds
                urls_to_process = self._select_new_urls(self.fetch_rss_urls(rss_url))

                if not urls_to_process:
                    self.logger.info("No new URLs to process")
                    continue

                self.logger.info(f"Processing {len(urls_to_process)} new URLs")

                for url in urls_to_process:
                    # Only successful lookups are remembered; failed ones are
                    # tried again on the next run.
                    if self.lookup_post(url):
                        self.processed_urls.add(url)
                        total_processed += 1

                    # Rate limiting: delay between requests
                    time.sleep(self.config["delay_between_requests"])

            runtime_seconds = time.time() - self.start_time
            current_instances = self._fetch_instance_count()
            self._print_statistics(total_processed, runtime_seconds, current_instances)

            self.save_processed_urls(current_instances)

            # Ping healthcheck success
            self.ping_healthcheck("")

        except Exception as e:
            self.logger.error(f"Error during processing: {e}")
            # Keep what was already handled so the next run does not repeat it.
            self.save_processed_urls()
            # Ping healthcheck failure
            self.ping_healthcheck("/fail")
            raise

    def ping_healthcheck(self, endpoint=""):
        """Ping healthchecks.io for monitoring.

        Args:
            endpoint: Suffix appended to the configured URL, e.g. ``"/start"``
                or ``"/fail"``; empty for the success ping.

        Does nothing when no healthcheck URL is configured. Failures are
        logged as warnings, not raised.
        """
        base_url = self.config.get("healthcheck_url")
        if not base_url:
            return

        try:
            # rstrip avoids "//start" when the configured URL ends with "/".
            requests.get(base_url.rstrip("/") + endpoint, timeout=10)
        except Exception as e:
            self.logger.warning(f"Failed to ping healthcheck: {e}")


def main():
    """Main entry point: build the fetcher, validate the config, run once.

    Raises:
        ValueError: If ``GTS_ACCESS_TOKEN`` is not set.
        Exception: Any other fatal error is logged and re-raised so the
            process exits with a non-zero status.
    """
    try:
        fetcher = GTSHolMirDas()

        # Validate required config
        if not fetcher.config["access_token"]:
            raise ValueError("GTS_ACCESS_TOKEN environment variable is required")

        fetcher.process_feeds()

    except Exception as e:
        logging.error(f"Fatal error: {e}")
        raise


if __name__ == "__main__":
    main()