From 5632a3d6410e2c90699dfdd169957867a0e1d64e Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 12:02:33 +0200 Subject: [PATCH 01/24] Initial release: GTS-HolMirDas v1.0 RSS-based content discovery for GoToSocial instances. Features: - Multi-instance RSS feed processing - Docker deployment with .env configuration - Comprehensive statistics and monitoring - Production-ready with proper secret management --- .env.example | 14 +++ .gitignore | 29 +++++ Dockerfile | 24 ++++ LICENSE | 21 ++++ README.md | 175 +++++++++++++++++++++++++++ compose.yml | 32 +++++ gts_holmirdas.py | 275 ++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 + rss_feeds.example.txt | 15 +++ 9 files changed, 588 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 README.md create mode 100644 compose.yml create mode 100644 gts_holmirdas.py create mode 100644 requirements.txt create mode 100644 rss_feeds.example.txt diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..23d6c88 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# GTS Server Configuration +GTS_SERVER_URL=https://your-gts-instance.tld +GTS_ACCESS_TOKEN=your_gts_access_token_here + +# Processing Configuration +MAX_POSTS_PER_RUN=25 +DELAY_BETWEEN_REQUESTS=1 +LOG_LEVEL=INFO + +# RSS Configuration +RSS_URLS_FILE=/app/rss_feeds.txt + +# Optional: Monitoring +# HEALTHCHECK_URL=https://hc-ping.com/your-uuid-here diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3e5dcf8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Secrets and local config +.env +rss_feeds.txt + +# Data directory +data/ +*.json + +# Python +__pycache__/ +*.pyc +*.pyo + +# Docker +.dockerignore + +# Logs +*.log + +# Editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..57540bd --- /dev/null +++ 
b/Dockerfile @@ -0,0 +1,24 @@ +# Dockerfile +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +# Copy and install requirements +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Create data directory +RUN mkdir -p /app/data + +# Create non-root user +RUN useradd -r -u 1000 holmirdas + +# Set ownership +RUN chown -R holmirdas:holmirdas /app + +# Switch to non-root user +USER holmirdas + +# Default command (will be overridden by docker-compose) +CMD ["python", "gts_holmirdas.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..450a765 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Matthias + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a4fcd75 --- /dev/null +++ b/README.md @@ -0,0 +1,175 @@ +# GTS-HolMirDas šŸš€ + +RSS-based content discovery for GoToSocial instances. 
+ +Automatically discovers and federates content from RSS feeds across the Fediverse, helping small GoToSocial instances populate their federated timeline without relying on traditional relays. + +## Features + +- šŸ“” **Multi-Instance RSS Discovery** - Fetches content from configurable RSS feeds across Fediverse instances +- ⚔ **Efficient Processing** - Configurable rate limiting and duplicate detection +- šŸ”§ **Production Ready** - Environment-based config, Docker deployment, health monitoring +- šŸ“Š **Comprehensive Statistics** - Runtime metrics, content processing, and federation growth tracking +- 🐳 **Containerized** - Simple Docker Compose deployment +- šŸ“ **File-based Configuration** - Easy RSS feed management via text files + +## How it Works + +GTS-HolMirDas reads RSS feeds from various Fediverse instances and uses GoToSocial's search API to federate the discovered content. This approach: + +- Maintains proper ActivityPub federation (posts remain interactive) +- Respects rate limits and instance policies +- Provides better content discovery for small instances +- Works alongside tools like FediFetcher for comprehensive federation + +## Quick Start + +```bash +# Clone the repository +git clone https://your-forgejo-instance.tld/user/gts-holmirdas +cd gts-holmirdas + +# Copy configuration templates +cp .env.example .env +cp rss_feeds.example.txt rss_feeds.txt + +# Edit configuration +nano .env # Add your GTS credentials +nano rss_feeds.txt # Customize RSS feeds + +# Deploy +docker-compose up -d + +# Monitor +docker-compose logs -f +``` + +## Configuration + +### Environment Variables (.env) + +```bash +# GTS Server Configuration +GTS_SERVER_URL=https://your-gts-instance.tld +GTS_ACCESS_TOKEN=your_gts_access_token + +# Processing Configuration +MAX_POSTS_PER_RUN=25 # Posts per feed per run +DELAY_BETWEEN_REQUESTS=1 # Seconds between API calls +LOG_LEVEL=INFO # Logging verbosity + +# RSS Configuration +RSS_URLS_FILE=/app/rss_feeds.txt # Path to RSS feeds 
file + +# Optional: Monitoring +HEALTHCHECK_URL=https://hc-ping.com/your-uuid +``` + +### RSS Feeds (rss_feeds.txt) + +``` +# Example RSS feeds - customize for your interests +# homelab +https://mastodon.social/tags/homelab.rss +https://fosstodon.org/tags/homelab.rss + +# selfhosting +https://mastodon.social/tags/selfhosting.rss +https://infosec.exchange/tags/selfhosting.rss + +# Add your preferred instances and hashtags +``` + +## Access Token Setup + +1. Login to your GoToSocial instance +2. Go to Settings → Applications +3. Create new application with scopes: `read`, `read:search`, `read:statuses` +4. Copy the access token to your `.env` file + +## Statistics Output + +``` +šŸ“Š GTS-HolMirDas Run Statistics: + ā±ļø Runtime: 0:04:14 + šŸ“„ Total posts processed: 45 + 🌐 Current known instances: 2519 + āž• New instances discovered: +3 + šŸ“” RSS feeds processed: 25 + ⚔ Posts per minute: 10.6 +``` + +## Resource Requirements + +- **Memory**: ~200-500MB depending on feed count +- **CPU**: Minimal (mostly I/O bound) +- **Storage**: <100MB for application, plus log storage +- **Network**: Depends on RSS feed count and frequency + +## Deployment Options + +### Docker Compose (Recommended) +```bash +docker-compose up -d +``` + +### Standalone Docker +```bash +docker build -t gts-holmirdas . 
+docker run -d --env-file .env \ + -v ./data:/app/data \ + -v ./gts_holmirdas.py:/app/gts_holmirdas.py:ro \ + -v ./rss_feeds.txt:/app/rss_feeds.txt:ro \ + gts-holmirdas +``` + +## Monitoring + +- **Logs**: `docker-compose logs -f` +- **Health**: Optional Healthchecks.io integration +- **Statistics**: Built-in runtime and performance metrics +- **Resource Usage**: Docker stats or container monitoring tools + +## Troubleshooting + +### Common Issues + +**No posts processed**: Check access token permissions and RSS feed URLs + +**Rate limiting errors**: Increase `DELAY_BETWEEN_REQUESTS` or reduce feed count + +**High memory usage**: Reduce `MAX_POSTS_PER_RUN` or feed frequency + +**Container won't start**: Verify `.env` file format and required variables + +### Debug Mode + +```bash +# Enable debug logging +echo "LOG_LEVEL=DEBUG" >> .env +docker-compose restart gts-holmirdas +``` + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Test thoroughly +5. Submit a pull request + +## Related Projects + +- [FediFetcher](https://github.com/nanos/fedifetcher) - Fetches missing replies and posts +- [GoToSocial](https://github.com/superseriousbusiness/gotosocial) - Lightweight ActivityPub server +- [slurp](https://github.com/VyrCossont/slurp) - Import posts from other instances + +## License + +MIT License - see LICENSE file for details. + +## Acknowledgments + +- Inspired by the [HolMirDas](https://github.com/aliceif/HolMirDas) concept +- Built for the GoToSocial community +- RSS-to-ActivityPub approach inspired by Fediverse discovery challenges diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000..c64bc46 --- /dev/null +++ b/compose.yml @@ -0,0 +1,32 @@ +services: + gts-holmirdas: + build: . 
+ container_name: gts-holmirdas + restart: unless-stopped + + env_file: + - .env + + volumes: + - ./data:/app/data + - ./gts_holmirdas.py:/app/gts_holmirdas.py:ro + - ./rss_feeds.txt:/app/rss_feeds.txt:ro + + # Run every hour (balanced frequency) + entrypoint: > + sh -c " + while true; do + echo 'Starting GTS-HolMirDas run...' + python gts_holmirdas.py + echo 'GTS-HolMirDas run completed. Sleeping for 1 hour...' + sleep 3600 + done + " + + # Resource limits + deploy: + resources: + limits: + memory: 512M + reservations: + memory: 256M diff --git a/gts_holmirdas.py b/gts_holmirdas.py new file mode 100644 index 0000000..2137c32 --- /dev/null +++ b/gts_holmirdas.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +""" +GTS-HolMirDas: RSS-based content discovery for GoToSocial +Fetches URLs from RSS feeds and uses GTS search API to federate content +""" + +import os +import sys +import time +import json +import logging +import requests +import feedparser +from datetime import timedelta +from urllib.parse import quote_plus + +class GTSHolMirDas: + def __init__(self): + """Initialize the RSS fetcher with configuration""" + self.config = { + "server_url": os.getenv("GTS_SERVER_URL", "https://your-gts-instance"), + "access_token": os.getenv("GTS_ACCESS_TOKEN", ""), + "max_posts_per_run": int(os.getenv("MAX_POSTS_PER_RUN", "25")), + "delay_between_requests": int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")), + "healthcheck_url": os.getenv("HEALTHCHECK_URL", ""), + "log_level": os.getenv("LOG_LEVEL", "INFO") + } + + # Setup logging FIRST + logging.basicConfig( + level=getattr(logging, self.config["log_level"]), + format='%(asctime)s - %(levelname)s - %(message)s' + ) + self.logger = logging.getLogger(__name__) + + # Load RSS URLs from file or environment + rss_urls_file = os.getenv("RSS_URLS_FILE") + if rss_urls_file and os.path.exists(rss_urls_file): + # Load from file + try: + with open(rss_urls_file, 'r') as f: + self.config["rss_urls"] = [ + line.strip() for line in f + if 
line.strip() and not line.startswith('#') + ] + self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from file: {rss_urls_file}") + except Exception as e: + self.logger.error(f"Could not load RSS URLs from file {rss_urls_file}: {e}") + self.config["rss_urls"] = [] + else: + # Fallback to environment variable + self.config["rss_urls"] = [ + url.strip() for url in os.getenv("RSS_URLS", "").split(",") + if url.strip() + ] + if self.config["rss_urls"]: + self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from environment") + + # Load processed URLs from persistent storage + self.processed_urls_file = "/app/data/processed_urls.json" + self.processed_urls = self.load_processed_urls() + + # Statistics tracking + self.previous_instances = getattr(self, 'previous_instances', 0) + + def load_processed_urls(self): + """Load previously processed URLs and instance count from file""" + try: + if os.path.exists(self.processed_urls_file): + with open(self.processed_urls_file, 'r') as f: + data = json.load(f) + # Load previous instance count for statistics + self.previous_instances = data.get('previous_instances', 0) + return set(data.get('processed_urls', [])) + except Exception as e: + self.logger.warning(f"Could not load processed URLs: {e}") + + return set() + + def save_processed_urls(self, current_instances=None): + """Save processed URLs and current instance count to file""" + try: + os.makedirs(os.path.dirname(self.processed_urls_file), exist_ok=True) + data = { + 'processed_urls': list(self.processed_urls), + 'last_updated': time.time() + } + # Save current instance count for next run + if current_instances is not None and current_instances != 'unknown': + data['previous_instances'] = current_instances + + with open(self.processed_urls_file, 'w') as f: + json.dump(data, f, indent=2) + except Exception as e: + self.logger.error(f"Could not save processed URLs: {e}") + + def fetch_rss_urls(self, rss_url): + """Fetch URLs from RSS feed""" + try: + 
self.logger.info(f"Fetching RSS feed: {rss_url}") + + # Parse RSS feed + feed = feedparser.parse(rss_url) + + if feed.bozo: + self.logger.warning(f"RSS feed may have issues: {rss_url}") + + # Extract URLs from entries + urls = [] + for entry in feed.entries: + if hasattr(entry, 'link'): + urls.append(entry.link) + + self.logger.info(f"Found {len(urls)} URLs in RSS feed") + return urls + + except Exception as e: + self.logger.error(f"Error fetching RSS feed {rss_url}: {e}") + return [] + + def lookup_post(self, post_url): + """Look up a post URL using GTS search API""" + try: + # Prepare search API call + search_url = f"{self.config['server_url']}/api/v2/search" + params = { + 'q': post_url, + 'type': 'statuses', + 'resolve': 'true', + 'limit': 1 + } + headers = { + 'Authorization': f'Bearer {self.config["access_token"]}', + 'Content-Type': 'application/json' + } + + # Make API call + response = requests.get( + search_url, + params=params, + headers=headers, + timeout=30 + ) + + if response.status_code == 200: + results = response.json() + if results.get('statuses') or results.get('accounts'): + self.logger.info(f"Successfully looked up: {post_url}") + return True + else: + self.logger.warning(f"No results for: {post_url}") + return False + else: + self.logger.error(f"API error {response.status_code} for {post_url}: {response.text}") + return False + + except requests.exceptions.RequestException as e: + self.logger.error(f"Error looking up {post_url}: {e}") + return False + + def process_feeds(self): + """Process all configured RSS feeds""" + total_processed = 0 + + # Record start time for statistics + self.start_time = time.time() + + # Ping healthcheck start + self.ping_healthcheck("/start") + + try: + for rss_url in self.config["rss_urls"]: + if not rss_url.strip(): + continue + + self.logger.info(f"Processing feed: {rss_url}") + + # Get URLs from RSS + urls = self.fetch_rss_urls(rss_url) + + # Filter out already processed URLs + new_urls = [url for url in urls 
if url not in self.processed_urls] + + if not new_urls: + self.logger.info("No new URLs to process") + continue + + # Rate limiting: max posts per run + urls_to_process = new_urls[:self.config["max_posts_per_run"]] + + self.logger.info(f"Processing {len(urls_to_process)} new URLs") + + for url in urls_to_process: + if self.lookup_post(url): + self.processed_urls.add(url) + total_processed += 1 + + # Rate limiting: delay between requests + time.sleep(self.config["delay_between_requests"]) + + # Calculate runtime + end_time = time.time() + runtime_seconds = end_time - self.start_time + runtime_formatted = str(timedelta(seconds=int(runtime_seconds))) + + # Get current instance count + try: + instance_info = requests.get(f"{self.config['server_url']}/api/v1/instance", + headers={'Authorization': f'Bearer {self.config["access_token"]}'}, + timeout=10) + if instance_info.status_code == 200: + current_instances = instance_info.json().get('stats', {}).get('domain_count', 'unknown') + else: + current_instances = 'unknown' + except Exception as e: + self.logger.error(f"Failed to get instance count: {e}") + current_instances = 'unknown' + + # Calculate new instances (if we have previous data) + new_instances = 'unknown' + if self.previous_instances > 0 and current_instances != 'unknown': + new_instances = current_instances - self.previous_instances + + # Print comprehensive statistics + print(f"\nšŸ“Š GTS-HolMirDas Run Statistics:") + print(f" ā±ļø Runtime: {runtime_formatted}") + print(f" šŸ“„ Total posts processed: {total_processed}") + print(f" 🌐 Current known instances: {current_instances}") + if new_instances != 'unknown' and new_instances > 0: + print(f" āž• New instances discovered: +{new_instances}") + elif new_instances == 0: + print(f" āž• New instances discovered: +0") + print(f" šŸ“” RSS feeds processed: {len(self.config['rss_urls'])}") + if runtime_seconds > 60: + print(f" ⚔ Posts per minute: {total_processed / (runtime_seconds / 60):.1f}") + + 
self.save_processed_urls(current_instances) + + # Ping healthcheck success + self.ping_healthcheck("") + + except Exception as e: + self.logger.error(f"Error during processing: {e}") + # Ping healthcheck failure + self.ping_healthcheck("/fail") + raise + + def ping_healthcheck(self, endpoint=""): + """Ping healthchecks.io for monitoring""" + if not self.config.get("healthcheck_url"): + return + + try: + url = self.config["healthcheck_url"] + endpoint + requests.get(url, timeout=10) + except Exception as e: + self.logger.warning(f"Failed to ping healthcheck: {e}") + +def main(): + """Main entry point""" + try: + fetcher = GTSHolMirDas() + + # Validate required config + if not fetcher.config["access_token"]: + raise ValueError("GTS_ACCESS_TOKEN environment variable is required") + + fetcher.process_feeds() + + except Exception as e: + logging.error(f"Fatal error: {e}") + raise + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f2d32a5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests==2.31.0 +feedparser==6.0.10 +urllib3==2.0.7 diff --git a/rss_feeds.example.txt b/rss_feeds.example.txt new file mode 100644 index 0000000..2284fc4 --- /dev/null +++ b/rss_feeds.example.txt @@ -0,0 +1,15 @@ +# Example RSS feeds - customize for your interests + +# homelab +https://mastodon.social/tags/homelab.rss +https://fosstodon.org/tags/homelab.rss + +# selfhosting +https://mastodon.social/tags/selfhosting.rss +https://infosec.exchange/tags/selfhosting.rss + +# docker +https://social.tchncs.de/tags/docker.rss +https://fosstodon.org/tags/docker.rss + +# Add your preferred instances and hashtags... 
From 7f802646ab207f01ae25823dccbb8cbce046d7fb Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 10:37:17 +0000 Subject: [PATCH 02/24] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a4fcd75..6f437db 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ GTS-HolMirDas reads RSS feeds from various Fediverse instances and uses GoToSoci ```bash # Clone the repository -git clone https://your-forgejo-instance.tld/user/gts-holmirdas +git clone https://git.klein.ruhr/user/gts-holmirdas cd gts-holmirdas # Copy configuration templates @@ -38,10 +38,10 @@ nano .env # Add your GTS credentials nano rss_feeds.txt # Customize RSS feeds # Deploy -docker-compose up -d +docker compose up -d # Monitor -docker-compose logs -f +docker compose logs -f ``` ## Configuration @@ -91,7 +91,7 @@ https://infosec.exchange/tags/selfhosting.rss ``` šŸ“Š GTS-HolMirDas Run Statistics: - ā±ļø Runtime: 0:04:14 + ā±ļø Runtime: 0:04:14 šŸ“„ Total posts processed: 45 🌐 Current known instances: 2519 āž• New instances discovered: +3 @@ -110,7 +110,7 @@ https://infosec.exchange/tags/selfhosting.rss ### Docker Compose (Recommended) ```bash -docker-compose up -d +docker compose up -d ``` ### Standalone Docker @@ -147,7 +147,7 @@ docker run -d --env-file .env \ ```bash # Enable debug logging echo "LOG_LEVEL=DEBUG" >> .env -docker-compose restart gts-holmirdas +docker compose restart gts-holmirdas ``` ## Contributing From 3ddc757123185c862e2ba128bcc6c2d8b236b70d Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 10:40:25 +0000 Subject: [PATCH 03/24] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6f437db..32dd3f5 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ docker run -d --env-file .env \ ## Monitoring -- **Logs**: `docker-compose logs -f` +- **Logs**: `docker compose logs -f` - **Health**: Optional 
Healthchecks.io integration - **Statistics**: Built-in runtime and performance metrics - **Resource Usage**: Docker stats or container monitoring tools From b2936259d5c134bf25d72a1abf04841807b9efed Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 11:28:55 +0000 Subject: [PATCH 04/24] Update README.md --- README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 32dd3f5..35b0129 100644 --- a/README.md +++ b/README.md @@ -134,13 +134,10 @@ docker run -d --env-file .env \ ### Common Issues -**No posts processed**: Check access token permissions and RSS feed URLs - -**Rate limiting errors**: Increase `DELAY_BETWEEN_REQUESTS` or reduce feed count - -**High memory usage**: Reduce `MAX_POSTS_PER_RUN` or feed frequency - -**Container won't start**: Verify `.env` file format and required variables +- **No posts processed**: Check access token permissions and RSS feed URLs +- **Rate limiting errors**: Increase `DELAY_BETWEEN_REQUESTS` or reduce feed count +- **High memory usage**: Reduce `MAX_POSTS_PER_RUN` or feed frequency +- **Container won't start**: Verify `.env` file format and required variables ### Debug Mode From 1a79e6a4b442b7bb5c6d1603e511fd06bdd643cf Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 11:30:44 +0000 Subject: [PATCH 05/24] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 35b0129..125394d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # GTS-HolMirDas šŸš€ -RSS-based content discovery for GoToSocial instances. +RSS-based content discovery for **[https://codeberg.org/superseriousbusiness/gotosocial](GoToSocial)** instances. Automatically discovers and federates content from RSS feeds across the Fediverse, helping small GoToSocial instances populate their federated timeline without relying on traditional relays. 
@@ -15,7 +15,7 @@ Automatically discovers and federates content from RSS feeds across the Fedivers ## How it Works -GTS-HolMirDas reads RSS feeds from various Fediverse instances and uses GoToSocial's search API to federate the discovered content. This approach: +**GTS-HolMirDas** reads RSS feeds from various Fediverse instances and uses GoToSocial's search API to federate the discovered content. This approach: - Maintains proper ActivityPub federation (posts remain interactive) - Respects rate limits and instance policies From bd4944c6191247727315f46e8d839c539f335ea2 Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 11:31:10 +0000 Subject: [PATCH 06/24] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 125394d..d364591 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # GTS-HolMirDas šŸš€ -RSS-based content discovery for **[https://codeberg.org/superseriousbusiness/gotosocial](GoToSocial)** instances. +RSS-based content discovery for **[GoToSocial](https://codeberg.org/superseriousbusiness/gotosocial)** instances. Automatically discovers and federates content from RSS feeds across the Fediverse, helping small GoToSocial instances populate their federated timeline without relying on traditional relays. From f783c08909b42ef07f9f557ffa18f4b29020adff Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 13:05:38 +0000 Subject: [PATCH 07/24] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d364591..ca316f2 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ RSS-based content discovery for **[GoToSocial](https://codeberg.org/superserious Automatically discovers and federates content from RSS feeds across the Fediverse, helping small GoToSocial instances populate their federated timeline without relying on traditional relays. 
+*Inspired by the original [HolMirDas](https://github.com/aliceif/HolMirDas) for Misskey by [@aliceif](https://github.com/aliceif) ([@aliceif@mkultra.x27.one](https://mkultra.x27.one/@aliceif)), this GoToSocial adaptation extends the RSS-to-ActivityPub concept with enhanced Docker deployment and multi-instance processing.* + ## Features - šŸ“” **Multi-Instance RSS Discovery** - Fetches content from configurable RSS feeds across Fediverse instances @@ -167,6 +169,6 @@ MIT License - see LICENSE file for details. ## Acknowledgments -- Inspired by the [HolMirDas](https://github.com/aliceif/HolMirDas) concept +- Inspired by [HolMirDas](https://github.com/aliceif/HolMirDas) by [@aliceif](https://github.com/aliceif) ([@aliceif@mkultra.x27.one](https://mkultra.x27.one/@aliceif)) - the original RSS-to-ActivityPub concept - Built for the GoToSocial community - RSS-to-ActivityPub approach inspired by Fediverse discovery challenges From 5b978f144518cad583f4000274f6a63b30ca02fa Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 15:07:44 +0200 Subject: [PATCH 08/24] Add proper attribution to @aliceif for original HolMirDas concept - Acknowledge original HolMirDas for Misskey by @aliceif - Link both GitHub and Misskey profiles for cross-platform discovery - Clarify relationship between original Misskey tool and GoToSocial adaptation --- gts_holmirdas.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gts_holmirdas.py b/gts_holmirdas.py index 2137c32..642695e 100644 --- a/gts_holmirdas.py +++ b/gts_holmirdas.py @@ -1,7 +1,13 @@ #!/usr/bin/env python3 """ GTS-HolMirDas: RSS-based content discovery for GoToSocial -Fetches URLs from RSS feeds and uses GTS search API to federate content + +Inspired by HolMirDas by @aliceif: +- GitHub: https://github.com/aliceif/HolMirDas +- Fediverse: @aliceif@mkultra.x27.one + +This GoToSocial adaptation extends the original RSS-to-ActivityPub concept +with Docker deployment, multi-instance processing, and 
comprehensive monitoring. """ import os From 86133f8a6d052fdb50c67d17642085e0eb92aedf Mon Sep 17 00:00:00 2001 From: matthias Date: Tue, 29 Jul 2025 14:18:51 +0000 Subject: [PATCH 09/24] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ca316f2..a81f861 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Automatically discovers and federates content from RSS feeds across the Fedivers ```bash # Clone the repository -git clone https://git.klein.ruhr/user/gts-holmirdas +git clone https://git.klein.ruhr/matthias/gts-holmirdas cd gts-holmirdas # Copy configuration templates From ae9c750da41c3d5813165721462b5b05fe4ae246 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 30 Jul 2025 11:59:43 +0200 Subject: [PATCH 10/24] first commit --- README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 From b1c6474a246051bd17fa2b6219be0c7097ee4250 Mon Sep 17 00:00:00 2001 From: matthias Date: Wed, 30 Jul 2025 10:10:24 +0000 Subject: [PATCH 11/24] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e69de29..dfbba42 100644 --- a/README.md +++ b/README.md @@ -0,0 +1 @@ +https://keyoxide.org/AF953733C09F6368797CA49E2F4968B5F7988AA5 \ No newline at end of file From cfd6fb294e11b362cab0da5a94d2202b4863a27a Mon Sep 17 00:00:00 2001 From: matthias Date: Wed, 30 Jul 2025 10:11:07 +0000 Subject: [PATCH 12/24] Add LICENSE --- LICENSE | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e69de29 From 74d2ed41b684fc0c22d497e5bf0ee574162f74e8 Mon Sep 17 00:00:00 2001 From: matthias Date: Wed, 30 Jul 2025 10:11:17 +0000 Subject: [PATCH 13/24] Update LICENSE --- LICENSE | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/LICENSE b/LICENSE index 
e69de29..2e97a1d 100644 --- a/LICENSE +++ b/LICENSE @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) 2024 oliverpifferi + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file From 80867fc857a7d8dd8bbba6097280dde8260c1bdb Mon Sep 17 00:00:00 2001 From: matthias Date: Wed, 30 Jul 2025 18:02:33 +0200 Subject: [PATCH 14/24] v1.1.0: Performance scaling with RSS URL parameters - Add support for ?limit= URL parameters (up to 100 posts/feed) - Update documentation with performance scaling guidelines - Add progressive scaling examples in rss_feeds.example.txt - Include optimization strategies and troubleshooting - Maintain backward compatibility with standard RSS feeds --- .env.example | 2 +- README.md | 131 ++++++++++++++++++++++++++++++++++++++++++ rss_feeds.example.txt | 22 +++---- 3 files changed, 144 insertions(+), 11 deletions(-) diff --git a/.env.example b/.env.example index 23d6c88..2593cf5 100644 --- a/.env.example +++ b/.env.example @@ -3,7 +3,7 @@ GTS_SERVER_URL=https://your-gts-instance.tld GTS_ACCESS_TOKEN=your_gts_access_token_here # Processing Configuration -MAX_POSTS_PER_RUN=25 +MAX_POSTS_PER_RUN=75 DELAY_BETWEEN_REQUESTS=1 LOG_LEVEL=INFO diff --git a/README.md b/README.md index ca316f2..b4e55ed 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,137 @@ docker compose up -d # Monitor docker compose logs -f ``` +# Performance Scaling & Configuration + +## šŸš€ RSS Feed Optimization (v1.1.0+) + +GTS-HolMirDas supports URL parameters to dramatically increase content discovery without additional API calls. 
+ +### RSS Feed Limits + +Most Mastodon-compatible instances support the `?limit=X` parameter: + +``` +# Default behavior (20 posts per feed) +https://mastodon.social/tags/homelab.rss + +# Increased limits (up to 100 posts per feed) +https://mastodon.social/tags/homelab.rss?limit=50 +https://fosstodon.org/tags/docker.rss?limit=100 +``` + +**Supported limits:** 20 (default), 50, 75, 100 (instance-dependent) + +### Performance Impact + +| Configuration | Posts/Run | API Calls | Processing Time | +|---------------|-----------|-----------|-----------------| +| Standard (limit=20) | ~100 posts | 30+ feeds | 2-5 minutes | +| Optimized (limit=50) | ~300 posts | 30+ feeds | 5-10 minutes | +| Maximum (limit=100) | ~600 posts | 30+ feeds | 8-15 minutes | + +## āš™ļø Configuration Tuning + +### Environment Variables + +```env +# Processing Configuration +MAX_POSTS_PER_RUN=75 # Increase for higher limits +DELAY_BETWEEN_REQUESTS=1 # Balance speed vs. server load +RSS_URLS_FILE=/app/rss_feeds.txt + +# Recommended combinations: +# Conservative: MAX_POSTS_PER_RUN=40, limit=50 +# Balanced: MAX_POSTS_PER_RUN=75, limit=100 +# Aggressive: MAX_POSTS_PER_RUN=100, limit=100 +``` + +### RSS Feed Strategy + +``` +# Progressive scaling approach: +# 1. Start with mixed limits to test performance +# 2. Increase gradually based on server capacity +# 3. 
Monitor GoToSocial memory usage + +# Example progression: +https://mastodon.social/tags/homelab.rss?limit=50 +https://fosstodon.org/tags/selfhosting.rss?limit=75 +https://chaos.social/tags/docker.rss?limit=100 +``` + +## šŸ“Š Monitoring & Optimization + +### Performance Metrics + +The statistics output shows real-time performance: + +``` +šŸ“Š GTS-HolMirDas Run Statistics: + ā±ļø Runtime: 0:08:42 + šŸ“„ Total posts processed: 487 + 🌐 Current known instances: 3150 + āž• New instances discovered: +45 + šŸ“” RSS feeds processed: 102 + ⚔ Posts per minute: 56.0 +``` + +### Optimization Guidelines + +**Memory Usage:** +- Monitor GoToSocial memory consumption during runs +- Each 100 additional posts ā‰ˆ ~2-5MB additional RAM +- Recommended: 1GB+ RAM for aggressive configurations + +**Processing Time:** +- Scales linearly with `MAX_POSTS_PER_RUN Ɨ number_of_feeds` +- Duplicate detection becomes more important at scale +- Consider running frequency vs. content volume + +**Federation Growth:** +- Higher limits = more diverse instance discovery +- Expect 20-50+ new instances per optimized run +- Balance discovery rate with storage capacity + +### Troubleshooting High-Volume Setups + +**If processing takes too long:** +```env +MAX_POSTS_PER_RUN=50 # Reduce from 75/100 +DELAY_BETWEEN_REQUESTS=2 # Increase from 1 +``` + +**If GoToSocial uses too much memory:** +- Reduce RSS feed count temporarily +- Lower `?limit=` parameters to 50 instead of 100 +- Increase run frequency instead of volume + +**If duplicate detection is slow:** +- Storage cleanup: `docker-compose exec gts-holmirdas rm -f /app/data/processed_urls.json` +- This forces fresh state tracking (posts will be reprocessed once) + +## šŸŽÆ Best Practices + +### Scaling Strategy + +1. **Start Conservative:** `limit=50`, `MAX_POSTS_PER_RUN=40` +2. **Monitor Performance:** Check RAM usage and processing time +3. **Scale Gradually:** Increase to `limit=75`, then `limit=100` +4. 
**Optimize Mix:** Use different limits per instance based on quality + +### Instance Selection + +**High-quality instances for aggressive limits:** +``` +# Tech-focused instances (good signal-to-noise ratio) +https://fosstodon.org/tags/homelab.rss?limit=100 +https://infosec.exchange/tags/security.rss?limit=100 + +# General instances (moderate limits recommended) +https://mastodon.social/tags/technology.rss?limit=50 +``` + +**Performance tip:** Specialized instances often have higher content quality at scale than general-purpose instances. ## Configuration diff --git a/rss_feeds.example.txt b/rss_feeds.example.txt index 2284fc4..d532f51 100644 --- a/rss_feeds.example.txt +++ b/rss_feeds.example.txt @@ -1,15 +1,17 @@ # Example RSS feeds - customize for your interests -# homelab -https://mastodon.social/tags/homelab.rss -https://fosstodon.org/tags/homelab.rss +# Add ?limit=X parameter to increase posts per feed (default: 20, max: 100) +# Higher limits = more content discovery, but longer processing time +# Performance tip: Start with limit=50, then increase to 100 if needed -# selfhosting -https://mastodon.social/tags/selfhosting.rss -https://infosec.exchange/tags/selfhosting.rss +# homelab (up to 100 posts per feed) +https://mastodon.social/tags/homelab.rss # 20 posts/feed (default) +https://fosstodon.org/tags/homelab.rss?limit=50 # 50 posts/feed -# docker -https://social.tchncs.de/tags/docker.rss -https://fosstodon.org/tags/docker.rss +# selfhosting (up to 100 posts per feed) +https://mastodon.social/tags/selfhosting.rss?limit=100 # 100 posts/feed +https://infosec.exchange/tags/selfhosting.rss?limit=100 # 100 posts/feed -# Add your preferred instances and hashtags... 
+# docker (up to 100 posts per feed) +https://social.tchncs.de/tags/docker.rss?limit=100 # 100 posts/feed +https://fosstodon.org/tags/docker.rss?limit=100 # 100 posts/feed From 798433af0790c334725e1557b8aeb3cf41a8ddad Mon Sep 17 00:00:00 2001 From: matthias Date: Sun, 3 Aug 2025 20:20:01 +0000 Subject: [PATCH 15/24] Streamline README, move detailed docs to Wiki --- README.md | 301 ++++++++++-------------------------------------------- 1 file changed, 54 insertions(+), 247 deletions(-) diff --git a/README.md b/README.md index 96916f9..c182610 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,20 @@ # GTS-HolMirDas šŸš€ -RSS-based content discovery for **[GoToSocial](https://codeberg.org/superseriousbusiness/gotosocial)** instances. +RSS-based content discovery for [GoToSocial](https://codeberg.org/superseriousbusiness/gotosocial) instances. Automatically discovers and federates content from RSS feeds across the Fediverse, helping small GoToSocial instances populate their federated timeline without relying on traditional relays. -*Inspired by the original [HolMirDas](https://github.com/aliceif/HolMirDas) for Misskey by [@aliceif](https://github.com/aliceif) ([@aliceif@mkultra.x27.one](https://mkultra.x27.one/@aliceif)), this GoToSocial adaptation extends the RSS-to-ActivityPub concept with enhanced Docker deployment and multi-instance processing.* +Inspired by the original [HolMirDas](https://github.com/aliceif/HolMirDas) by [@aliceif](https://mkultra.x27.one/@aliceif), adapted for GoToSocial with enhanced Docker deployment and multi-instance processing. 
-## Features +## ✨ Key Features -- šŸ“” **Multi-Instance RSS Discovery** - Fetches content from configurable RSS feeds across Fediverse instances -- ⚔ **Efficient Processing** - Configurable rate limiting and duplicate detection -- šŸ”§ **Production Ready** - Environment-based config, Docker deployment, health monitoring -- šŸ“Š **Comprehensive Statistics** - Runtime metrics, content processing, and federation growth tracking -- 🐳 **Containerized** - Simple Docker Compose deployment -- šŸ“ **File-based Configuration** - Easy RSS feed management via text files +- **šŸ“” Multi-Instance Discovery** - Fetches content from configurable RSS feeds across Fediverse instances +- **⚔ Performance Scaling** - 20-100 posts per feed with URL parameters (`?limit=100`) +- **🐳 Production Ready** - Docker deployment, environment-based config, health monitoring +- **šŸ“Š Comprehensive Stats** - Runtime metrics, federation growth, performance tracking +- **šŸ”§ Zero Maintenance** - Runs automatically every hour with duplicate detection -## How it Works - -**GTS-HolMirDas** reads RSS feeds from various Fediverse instances and uses GoToSocial's search API to federate the discovered content. 
This approach: - -- Maintains proper ActivityPub federation (posts remain interactive) -- Respects rate limits and instance policies -- Provides better content discovery for small instances -- Works alongside tools like FediFetcher for comprehensive federation - -## Quick Start +## šŸš€ Quick Start ```bash # Clone the repository @@ -36,8 +26,8 @@ cp .env.example .env cp rss_feeds.example.txt rss_feeds.txt # Edit configuration -nano .env # Add your GTS credentials -nano rss_feeds.txt # Customize RSS feeds +nano .env # Add your GTS credentials +nano rss_feeds.txt # Customize RSS feeds # Deploy docker compose up -d @@ -45,261 +35,78 @@ docker compose up -d # Monitor docker compose logs -f ``` -# Performance Scaling & Configuration -## šŸš€ RSS Feed Optimization (v1.1.0+) - -GTS-HolMirDas supports URL parameters to dramatically increase content discovery without additional API calls. - -### RSS Feed Limits - -Most Mastodon-compatible instances support the `?limit=X` parameter: +## šŸ“ˆ Performance at Scale +**Real Production Data:** ``` -# Default behavior (20 posts per feed) -https://mastodon.social/tags/homelab.rss - -# Increased limits (up to 100 posts per feed) -https://mastodon.social/tags/homelab.rss?limit=50 -https://fosstodon.org/tags/docker.rss?limit=100 +šŸ“Š Runtime: 8:42 | 487 posts processed | 3,150+ instances discovered +⚔ 56 posts/minute | 102 RSS feeds | +45 new instances per run +šŸ’¾ Resource usage: ~450MB RAM total (GoToSocial + tools) ``` -**Supported limits:** 20 (default), 50, 75, 100 (instance-dependent) +**Scaling Options:** +- **Conservative:** 20 posts/feed (~100 posts/run) +- **Balanced:** 50 posts/feed (~300 posts/run) +- **Aggressive:** 100 posts/feed (~600 posts/run) -### Performance Impact - -| Configuration | Posts/Run | API Calls | Processing Time | -|---------------|-----------|-----------|-----------------| -| Standard (limit=20) | ~100 posts | 30+ feeds | 2-5 minutes | -| Optimized (limit=50) | ~300 posts | 30+ feeds | 5-10 minutes | 
-| Maximum (limit=100) | ~600 posts | 30+ feeds | 8-15 minutes | - -## āš™ļø Configuration Tuning - -### Environment Variables - -```env -# Processing Configuration -MAX_POSTS_PER_RUN=75 # Increase for higher limits -DELAY_BETWEEN_REQUESTS=1 # Balance speed vs. server load -RSS_URLS_FILE=/app/rss_feeds.txt - -# Recommended combinations: -# Conservative: MAX_POSTS_PER_RUN=40, limit=50 -# Balanced: MAX_POSTS_PER_RUN=75, limit=100 -# Aggressive: MAX_POSTS_PER_RUN=100, limit=100 -``` - -### RSS Feed Strategy - -``` -# Progressive scaling approach: -# 1. Start with mixed limits to test performance -# 2. Increase gradually based on server capacity -# 3. Monitor GoToSocial memory usage - -# Example progression: -https://mastodon.social/tags/homelab.rss?limit=50 -https://fosstodon.org/tags/selfhosting.rss?limit=75 -https://chaos.social/tags/docker.rss?limit=100 -``` - -## šŸ“Š Monitoring & Optimization - -### Performance Metrics - -The statistics output shows real-time performance: - -``` -šŸ“Š GTS-HolMirDas Run Statistics: - ā±ļø Runtime: 0:08:42 - šŸ“„ Total posts processed: 487 - 🌐 Current known instances: 3150 - āž• New instances discovered: +45 - šŸ“” RSS feeds processed: 102 - ⚔ Posts per minute: 56.0 -``` - -### Optimization Guidelines - -**Memory Usage:** -- Monitor GoToSocial memory consumption during runs -- Each 100 additional posts ā‰ˆ ~2-5MB additional RAM -- Recommended: 1GB+ RAM for aggressive configurations - -**Processing Time:** -- Scales linearly with `MAX_POSTS_PER_RUN Ɨ number_of_feeds` -- Duplicate detection becomes more important at scale -- Consider running frequency vs. 
content volume - -**Federation Growth:** -- Higher limits = more diverse instance discovery -- Expect 20-50+ new instances per optimized run -- Balance discovery rate with storage capacity - -### Troubleshooting High-Volume Setups - -**If processing takes too long:** -```env -MAX_POSTS_PER_RUN=50 # Reduce from 75/100 -DELAY_BETWEEN_REQUESTS=2 # Increase from 1 -``` - -**If GoToSocial uses too much memory:** -- Reduce RSS feed count temporarily -- Lower `?limit=` parameters to 50 instead of 100 -- Increase run frequency instead of volume - -**If duplicate detection is slow:** -- Storage cleanup: `docker-compose exec gts-holmirdas rm -f /app/data/processed_urls.json` -- This forces fresh state tracking (posts will be reprocessed once) - -## šŸŽÆ Best Practices - -### Scaling Strategy - -1. **Start Conservative:** `limit=50`, `MAX_POSTS_PER_RUN=40` -2. **Monitor Performance:** Check RAM usage and processing time -3. **Scale Gradually:** Increase to `limit=75`, then `limit=100` -4. **Optimize Mix:** Use different limits per instance based on quality - -### Instance Selection - -**High-quality instances for aggressive limits:** -``` -# Tech-focused instances (good signal-to-noise ratio) -https://fosstodon.org/tags/homelab.rss?limit=100 -https://infosec.exchange/tags/security.rss?limit=100 - -# General instances (moderate limits recommended) -https://mastodon.social/tags/technology.rss?limit=50 -``` - -**Performance tip:** Specialized instances often have higher content quality at scale than general-purpose instances. 
- -## Configuration +## šŸ› ļø Configuration Essentials ### Environment Variables (.env) - ```bash -# GTS Server Configuration +# Required GTS_SERVER_URL=https://your-gts-instance.tld GTS_ACCESS_TOKEN=your_gts_access_token -# Processing Configuration +# Performance Tuning MAX_POSTS_PER_RUN=25 # Posts per feed per run DELAY_BETWEEN_REQUESTS=1 # Seconds between API calls -LOG_LEVEL=INFO # Logging verbosity - -# RSS Configuration -RSS_URLS_FILE=/app/rss_feeds.txt # Path to RSS feeds file - -# Optional: Monitoring -HEALTHCHECK_URL=https://hc-ping.com/your-uuid +LOG_LEVEL=INFO # DEBUG for troubleshooting ``` ### RSS Feeds (rss_feeds.txt) - -``` -# Example RSS feeds - customize for your interests -# homelab -https://mastodon.social/tags/homelab.rss -https://fosstodon.org/tags/homelab.rss - -# selfhosting -https://mastodon.social/tags/selfhosting.rss -https://infosec.exchange/tags/selfhosting.rss - -# Add your preferred instances and hashtags +```bash +# Use URL parameters to scale performance +https://mastodon.social/tags/homelab.rss?limit=50 +https://fosstodon.org/tags/selfhosting.rss?limit=100 +https://infosec.exchange/tags/security.rss?limit=75 ``` -## Access Token Setup - +### GoToSocial Access Token 1. Login to your GoToSocial instance -2. Go to Settings → Applications -3. Create new application with scopes: `read`, `read:search`, `read:statuses` -4. Copy the access token to your `.env` file +2. Settings → Applications → Create new application +3. Required scopes: `read`, `read:search`, `read:statuses` +4. 
Copy access token to `.env` file -## Statistics Output +## šŸ“– Complete Documentation -``` -šŸ“Š GTS-HolMirDas Run Statistics: - ā±ļø Runtime: 0:04:14 - šŸ“„ Total posts processed: 45 - 🌐 Current known instances: 2519 - āž• New instances discovered: +3 - šŸ“” RSS feeds processed: 25 - ⚔ Posts per minute: 10.6 -``` +For detailed information, visit our **[Wiki](../../wiki)**: -## Resource Requirements +- **[šŸ“‹ Installation Guide](../../wiki/Installation-Guide)** - Detailed setup, Docker configuration, deployment options +- **[šŸ“ˆ Performance & Scaling](../../wiki/Performance-Scaling)** - Optimization tables, scaling strategies, resource planning +- **[šŸ› ļø Troubleshooting](../../wiki/Troubleshooting)** - Common issues, Docker problems, debugging guide +- **[āš™ļø Advanced Configuration](../../wiki/Advanced-Configuration)** - Environment variables, RSS strategies, production tips +- **[šŸ“Š Monitoring & Stats](../../wiki/Monitoring-Stats)** - Understanding output, health monitoring, metrics +- **[ā“ FAQ](../../wiki/FAQ)** - Common questions and answers -- **Memory**: ~200-500MB depending on feed count -- **CPU**: Minimal (mostly I/O bound) -- **Storage**: <100MB for application, plus log storage -- **Network**: Depends on RSS feed count and frequency +## šŸ¤ Community & Support -## Deployment Options +- **Issues & Bug Reports:** [Create an Issue](../../issues) +- **Feature Requests:** [Discussions](../../discussions) +- **Matrix:** [#gotosocial-space:superseriousbusiness.org](https://matrix.to/#/#gotosocial-space:superseriousbusiness.org) -### Docker Compose (Recommended) -```bash -docker compose up -d -``` +## šŸ”— Related Projects -### Standalone Docker -```bash -docker build -t gts-holmirdas . 
-docker run -d --env-file .env \ - -v ./data:/app/data \ - -v ./gts_holmirdas.py:/app/gts_holmirdas.py:ro \ - -v ./rss_feeds.txt:/app/rss_feeds.txt:ro \ - gts-holmirdas -``` +- **[FediFetcher](https://github.com/nanos/fedifetcher)** - Fetches missing replies and posts +- **[GoToSocial](https://github.com/superseriousbusiness/gotosocial)** - Lightweight ActivityPub server +- **[slurp](https://github.com/VyrCossont/slurp)** - Import posts from other instances -## Monitoring +## šŸ“„ License -- **Logs**: `docker compose logs -f` -- **Health**: Optional Healthchecks.io integration -- **Statistics**: Built-in runtime and performance metrics -- **Resource Usage**: Docker stats or container monitoring tools +MIT License - see [LICENSE](LICENSE) file for details. -## Troubleshooting +## šŸ™ Acknowledgments -### Common Issues - -- **No posts processed**: Check access token permissions and RSS feed URLs -- **Rate limiting errors**: Increase `DELAY_BETWEEN_REQUESTS` or reduce feed count -- **High memory usage**: Reduce `MAX_POSTS_PER_RUN` or feed frequency -- **Container won't start**: Verify `.env` file format and required variables - -### Debug Mode - -```bash -# Enable debug logging -echo "LOG_LEVEL=DEBUG" >> .env -docker compose restart gts-holmirdas -``` - -## Contributing - -1. Fork the repository -2. Create a feature branch -3. Make your changes -4. Test thoroughly -5. Submit a pull request - -## Related Projects - -- [FediFetcher](https://github.com/nanos/fedifetcher) - Fetches missing replies and posts -- [GoToSocial](https://github.com/superseriousbusiness/gotosocial) - Lightweight ActivityPub server -- [slurp](https://github.com/VyrCossont/slurp) - Import posts from other instances - -## License - -MIT License - see LICENSE file for details. 
- -## Acknowledgments - -- Inspired by [HolMirDas](https://github.com/aliceif/HolMirDas) by [@aliceif](https://github.com/aliceif) ([@aliceif@mkultra.x27.one](https://mkultra.x27.one/@aliceif)) - the original RSS-to-ActivityPub concept +- Inspired by [HolMirDas](https://github.com/aliceif/HolMirDas) by [@aliceif](https://mkultra.x27.one/@aliceif) - Built for the GoToSocial community -- RSS-to-ActivityPub approach inspired by Fediverse discovery challenges +- RSS-to-ActivityPub federation approach \ No newline at end of file From 4bd1d05d936cccd9f9b415b75c08c4b5f7a6f130 Mon Sep 17 00:00:00 2001 From: matthias Date: Sun, 3 Aug 2025 20:46:12 +0000 Subject: [PATCH 16/24] Streamline README, move detailed docs to Wiki --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c182610..415a35b 100644 --- a/README.md +++ b/README.md @@ -82,12 +82,12 @@ https://infosec.exchange/tags/security.rss?limit=75 For detailed information, visit our **[Wiki](../../wiki)**: -- **[šŸ“‹ Installation Guide](../../wiki/Installation-Guide)** - Detailed setup, Docker configuration, deployment options -- **[šŸ“ˆ Performance & Scaling](../../wiki/Performance-Scaling)** - Optimization tables, scaling strategies, resource planning -- **[šŸ› ļø Troubleshooting](../../wiki/Troubleshooting)** - Common issues, Docker problems, debugging guide -- **[āš™ļø Advanced Configuration](../../wiki/Advanced-Configuration)** - Environment variables, RSS strategies, production tips -- **[šŸ“Š Monitoring & Stats](../../wiki/Monitoring-Stats)** - Understanding output, health monitoring, metrics -- **[ā“ FAQ](../../wiki/FAQ)** - Common questions and answers +- **[šŸ“‹ Installation Guide](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Installation-Guide)** - Detailed setup, Docker configuration, deployment options +- **[šŸ“ˆ Performance & Scaling](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Performance-%26-Scaling)** - Optimization tables, 
scaling strategies, resource planning +- **[šŸ› ļø Troubleshooting](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Troubleshooting)** - Common issues, Docker problems, debugging guide +- **[āš™ļø Advanced Configuration](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Advanced-Configuration)** - Environment variables, RSS strategies, production tips +- **[šŸ“Š Monitoring & Stats](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Monitoring-%26-Stats)** - Understanding output, health monitoring, metrics +- **[ā“ FAQ](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/FAQ+-+Frequently+Asked+Questions)** - Common questions and answers ## šŸ¤ Community & Support From c9545735ea16fb25915233e29ce0a9f17b9a4e11 Mon Sep 17 00:00:00 2001 From: matthias Date: Sun, 3 Aug 2025 20:47:38 +0000 Subject: [PATCH 17/24] Streamline README, move detailed docs to Wiki --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 415a35b..4a0dffa 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ https://infosec.exchange/tags/security.rss?limit=75 ## šŸ“– Complete Documentation -For detailed information, visit our **[Wiki](../../wiki)**: +For detailed information, visit our **[Wiki](https://git.klein.ruhr/matthias/gts-holmirdas/wiki)**: - **[šŸ“‹ Installation Guide](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Installation-Guide)** - Detailed setup, Docker configuration, deployment options - **[šŸ“ˆ Performance & Scaling](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Performance-%26-Scaling)** - Optimization tables, scaling strategies, resource planning From 228e3c8d51b8c7d8ebee712426dcdb97123120f7 Mon Sep 17 00:00:00 2001 From: matthias Date: Sun, 3 Aug 2025 20:49:42 +0000 Subject: [PATCH 18/24] Streamline README, move detailed docs to Wiki --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4a0dffa..a080748 100644 --- a/README.md +++ b/README.md @@ 
-82,12 +82,12 @@ https://infosec.exchange/tags/security.rss?limit=75 For detailed information, visit our **[Wiki](https://git.klein.ruhr/matthias/gts-holmirdas/wiki)**: -- **[šŸ“‹ Installation Guide](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Installation-Guide)** - Detailed setup, Docker configuration, deployment options +- **[šŸ“‹ Installation Guide](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Installation-Guide.-)** - Detailed setup, Docker configuration, deployment options - **[šŸ“ˆ Performance & Scaling](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Performance-%26-Scaling)** - Optimization tables, scaling strategies, resource planning - **[šŸ› ļø Troubleshooting](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Troubleshooting)** - Common issues, Docker problems, debugging guide - **[āš™ļø Advanced Configuration](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Advanced-Configuration)** - Environment variables, RSS strategies, production tips - **[šŸ“Š Monitoring & Stats](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Monitoring-%26-Stats)** - Understanding output, health monitoring, metrics -- **[ā“ FAQ](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/FAQ+-+Frequently+Asked+Questions)** - Common questions and answers +- **[ā“ FAQ](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/FAQ+-+Frequently+Asked+Questions.-)** - Common questions and answers ## šŸ¤ Community & Support From 8c000eea02afe69fbcb9c6ec8f21f1846a258e2b Mon Sep 17 00:00:00 2001 From: matthias Date: Sun, 3 Aug 2025 21:43:23 +0000 Subject: [PATCH 19/24] Streamline README, move detailed docs to Wiki --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a080748..d6f81d4 100644 --- a/README.md +++ b/README.md @@ -91,9 +91,9 @@ For detailed information, visit our **[Wiki](https://git.klein.ruhr/matthias/gts ## šŸ¤ Community & Support -- **Issues & Bug Reports:** [Create an Issue](../../issues) -- 
**Feature Requests:** [Discussions](../../discussions) -- **Matrix:** [#gotosocial-space:superseriousbusiness.org](https://matrix.to/#/#gotosocial-space:superseriousbusiness.org) +- **[Contributing Guide](Contributing)** - Development setup and contribution guidelines *(coming soon)* +- **Issues**: [Report bugs or request features](../issues) +- **Contact**: [@matthias@me.klein.ruhr](https://me.klein.ruhr/@matthias) on the Fediverse ## šŸ”— Related Projects From 298d11fc446226da3dce4f2b957688dc328c343c Mon Sep 17 00:00:00 2001 From: matthias Date: Sun, 3 Aug 2025 21:44:03 +0000 Subject: [PATCH 20/24] Streamline README, move detailed docs to Wiki --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d6f81d4..6187636 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ For detailed information, visit our **[Wiki](https://git.klein.ruhr/matthias/gts ## šŸ¤ Community & Support - **[Contributing Guide](Contributing)** - Development setup and contribution guidelines *(coming soon)* -- **Issues**: [Report bugs or request features](../issues) +- **Issues**: [Report bugs or request features](issues) - **Contact**: [@matthias@me.klein.ruhr](https://me.klein.ruhr/@matthias) on the Fediverse ## šŸ”— Related Projects From c8dabb5c0e41cff0e61b1cc73f55224391fa8887 Mon Sep 17 00:00:00 2001 From: matthias Date: Sun, 3 Aug 2025 21:45:58 +0000 Subject: [PATCH 21/24] Streamline README, move detailed docs to Wiki --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6187636..f58fd20 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ For detailed information, visit our **[Wiki](https://git.klein.ruhr/matthias/gts ## šŸ¤ Community & Support - **[Contributing Guide](Contributing)** - Development setup and contribution guidelines *(coming soon)* -- **Issues**: [Report bugs or request features](issues) +- **Issues**: [Report bugs or request features](Issues) - **Contact**: 
[@matthias@me.klein.ruhr](https://me.klein.ruhr/@matthias) on the Fediverse ## šŸ”— Related Projects From 0cbac9b31c7f826dcc58514fb6796c5fa8846c09 Mon Sep 17 00:00:00 2001 From: matthias Date: Sun, 3 Aug 2025 21:46:26 +0000 Subject: [PATCH 22/24] Streamline README, move detailed docs to Wiki --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f58fd20..b23f68e 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ For detailed information, visit our **[Wiki](https://git.klein.ruhr/matthias/gts ## šŸ¤ Community & Support - **[Contributing Guide](Contributing)** - Development setup and contribution guidelines *(coming soon)* -- **Issues**: [Report bugs or request features](Issues) +- **Issues**: [Report bugs or request features](https://git.klein.ruhr/matthias/gts-holmirdas/issues) - **Contact**: [@matthias@me.klein.ruhr](https://me.klein.ruhr/@matthias) on the Fediverse ## šŸ”— Related Projects From 750b425e337d90d8a93095db6e45c153ba6776ed Mon Sep 17 00:00:00 2001 From: matthias Date: Mon, 4 Aug 2025 10:15:06 +0200 Subject: [PATCH 23/24] Fix environment variable support for GTS_SERVER_URL --- gts_holmirdas.py | 436 ++++++++++++++++++++++++----------------------- 1 file changed, 223 insertions(+), 213 deletions(-) diff --git a/gts_holmirdas.py b/gts_holmirdas.py index 642695e..4b84f1d 100644 --- a/gts_holmirdas.py +++ b/gts_holmirdas.py @@ -1,118 +1,155 @@ #!/usr/bin/env python3 -""" -GTS-HolMirDas: RSS-based content discovery for GoToSocial - -Inspired by HolMirDas by @aliceif: -- GitHub: https://github.com/aliceif/HolMirDas -- Fediverse: @aliceif@mkultra.x27.one - -This GoToSocial adaptation extends the original RSS-to-ActivityPub concept -with Docker deployment, multi-instance processing, and comprehensive monitoring. 
-""" - import os -import sys import time -import json import logging import requests import feedparser -from datetime import timedelta -from urllib.parse import quote_plus +import json +from urllib.parse import urlparse +from datetime import datetime class GTSHolMirDas: def __init__(self): - """Initialize the RSS fetcher with configuration""" - self.config = { - "server_url": os.getenv("GTS_SERVER_URL", "https://your-gts-instance"), - "access_token": os.getenv("GTS_ACCESS_TOKEN", ""), - "max_posts_per_run": int(os.getenv("MAX_POSTS_PER_RUN", "25")), - "delay_between_requests": int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")), - "healthcheck_url": os.getenv("HEALTHCHECK_URL", ""), - "log_level": os.getenv("LOG_LEVEL", "INFO") - } + # Setup logging first + self.setup_logging() - # Setup logging FIRST + # Load configuration + self.config = self.load_config() + + # Setup rate limiting + self.last_request_time = 0 + self.min_delay = int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")) + + # Track processed URLs + self.data_dir = "/app/data" + self.processed_urls_file = os.path.join(self.data_dir, "processed_urls.json") + self.instances_file = os.path.join(self.data_dir, "known_instances.json") + + # Ensure data directory exists + os.makedirs(self.data_dir, exist_ok=True) + + # Load processed URLs and known instances + self.processed_urls = self.load_processed_urls() + self.known_instances = self.load_known_instances() + + def setup_logging(self): + """Setup logging configuration""" logging.basicConfig( - level=getattr(logging, self.config["log_level"]), - format='%(asctime)s - %(levelname)s - %(message)s' + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[logging.StreamHandler()] ) self.logger = logging.getLogger(__name__) + + def load_config(self): + """Load configuration from environment variables or file""" + config = {} - # Load RSS URLs from file or environment + # RSS URLs - try file first, then environment variable rss_urls_file = 
os.getenv("RSS_URLS_FILE") if rss_urls_file and os.path.exists(rss_urls_file): - # Load from file try: - with open(rss_urls_file, 'r') as f: - self.config["rss_urls"] = [ - line.strip() for line in f - if line.strip() and not line.startswith('#') - ] - self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from file: {rss_urls_file}") + with open(rss_urls_file, 'r', encoding='utf-8') as f: + rss_urls = [] + for line_num, line in enumerate(f, 1): + line = line.strip() + # Skip empty lines and comment-only lines + if not line or line.startswith('#'): + continue + + # Handle inline comments: split at # and take first part + if '#' in line: + url_part = line.split('#')[0].strip() + else: + url_part = line + + # Validate URL format + if url_part and url_part.startswith('http'): + # Remove any remaining control characters + clean_url = ''.join(char for char in url_part if ord(char) >= 32) + rss_urls.append(clean_url) + self.logger.debug(f"Line {line_num}: Added URL: {clean_url}") + elif url_part: + self.logger.warning(f"Line {line_num}: Invalid URL format: {url_part}") + + self.logger.info(f"Loaded {len(rss_urls)} RSS URLs from file: {rss_urls_file}") except Exception as e: self.logger.error(f"Could not load RSS URLs from file {rss_urls_file}: {e}") - self.config["rss_urls"] = [] + rss_urls = [] else: # Fallback to environment variable - self.config["rss_urls"] = [ - url.strip() for url in os.getenv("RSS_URLS", "").split(",") - if url.strip() - ] - if self.config["rss_urls"]: - self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from environment") + rss_urls_env = os.getenv("RSS_URLS", "") + rss_urls = [url.strip() for url in rss_urls_env.split(",") if url.strip()] + if rss_urls: + self.logger.info(f"Loaded {len(rss_urls)} RSS URLs from environment variable") + else: + self.logger.warning("No RSS URLs found in file or environment variable") + + config['rss_urls'] = rss_urls + config['gts_instance'] = os.getenv("GTS_SERVER_URL") or 
os.getenv("GTS_INSTANCE") + config['max_posts_per_run'] = int(os.getenv("MAX_POSTS_PER_RUN", "10")) - # Load processed URLs from persistent storage - self.processed_urls_file = "/app/data/processed_urls.json" - self.processed_urls = self.load_processed_urls() - - # Statistics tracking - self.previous_instances = getattr(self, 'previous_instances', 0) + return config def load_processed_urls(self): - """Load previously processed URLs and instance count from file""" + """Load processed URLs from file""" try: if os.path.exists(self.processed_urls_file): with open(self.processed_urls_file, 'r') as f: - data = json.load(f) - # Load previous instance count for statistics - self.previous_instances = data.get('previous_instances', 0) - return set(data.get('processed_urls', [])) + return set(json.load(f)) except Exception as e: self.logger.warning(f"Could not load processed URLs: {e}") - return set() - def save_processed_urls(self, current_instances=None): - """Save processed URLs and current instance count to file""" + def save_processed_urls(self): + """Save processed URLs to file""" try: - os.makedirs(os.path.dirname(self.processed_urls_file), exist_ok=True) - data = { - 'processed_urls': list(self.processed_urls), - 'last_updated': time.time() - } - # Save current instance count for next run - if current_instances is not None and current_instances != 'unknown': - data['previous_instances'] = current_instances - with open(self.processed_urls_file, 'w') as f: - json.dump(data, f, indent=2) + json.dump(list(self.processed_urls), f) except Exception as e: self.logger.error(f"Could not save processed URLs: {e}") - def fetch_rss_urls(self, rss_url): - """Fetch URLs from RSS feed""" + def load_known_instances(self): + """Load known instances from file""" try: - self.logger.info(f"Fetching RSS feed: {rss_url}") + if os.path.exists(self.instances_file): + with open(self.instances_file, 'r') as f: + return set(json.load(f)) + except Exception as e: + self.logger.warning(f"Could 
not load known instances: {e}") + return set() + + def save_known_instances(self): + """Save known instances to file""" + try: + with open(self.instances_file, 'w') as f: + json.dump(list(self.known_instances), f) + except Exception as e: + self.logger.error(f"Could not save known instances: {e}") + + def rate_limit(self): + """Implement rate limiting between requests""" + current_time = time.time() + time_since_last = current_time - self.last_request_time + + if time_since_last < self.min_delay: + sleep_time = self.min_delay - time_since_last + time.sleep(sleep_time) + + self.last_request_time = time.time() + + def fetch_rss_feed(self, url): + """Fetch and parse RSS feed""" + try: + self.logger.info(f"Fetching RSS feed: {url}") + self.rate_limit() - # Parse RSS feed - feed = feedparser.parse(rss_url) + response = requests.get(url, timeout=30) + response.raise_for_status() - if feed.bozo: - self.logger.warning(f"RSS feed may have issues: {rss_url}") - - # Extract URLs from entries + feed = feedparser.parse(response.content) urls = [] + for entry in feed.entries: if hasattr(entry, 'link'): urls.append(entry.link) @@ -121,161 +158,134 @@ class GTSHolMirDas: return urls except Exception as e: - self.logger.error(f"Error fetching RSS feed {rss_url}: {e}") + self.logger.error(f"Error fetching RSS feed {url}: {e}") return [] - def lookup_post(self, post_url): - """Look up a post URL using GTS search API""" - try: - # Prepare search API call - search_url = f"{self.config['server_url']}/api/v2/search" - params = { - 'q': post_url, - 'type': 'statuses', - 'resolve': 'true', - 'limit': 1 - } - headers = { - 'Authorization': f'Bearer {self.config["access_token"]}', - 'Content-Type': 'application/json' - } + def lookup_post(self, url): + """Lookup a post URL on the GTS instance""" + if not self.config['gts_instance']: + self.logger.warning("No GTS instance configured") + return False - # Make API call - response = requests.get( - search_url, - params=params, - headers=headers, 
- timeout=30 - ) + try: + lookup_url = f"{self.config['gts_instance']}/api/v1/statuses/lookup" + params = {"uri": url} + + self.rate_limit() + response = requests.get(lookup_url, params=params, timeout=10) if response.status_code == 200: - results = response.json() - if results.get('statuses') or results.get('accounts'): - self.logger.info(f"Successfully looked up: {post_url}") - return True - else: - self.logger.warning(f"No results for: {post_url}") - return False - else: - self.logger.error(f"API error {response.status_code} for {post_url}: {response.text}") + self.logger.info(f"Successfully looked up: {url}") + # Extract and store instance info + parsed_url = urlparse(url) + instance = f"{parsed_url.scheme}://{parsed_url.netloc}" + self.known_instances.add(instance) + return True + elif response.status_code == 404: + self.logger.warning(f"No results for: {url}") return False - - except requests.exceptions.RequestException as e: - self.logger.error(f"Error looking up {post_url}: {e}") + else: + self.logger.warning(f"Lookup failed for {url}: {response.status_code}") + return False + + except Exception as e: + self.logger.error(f"Error looking up {url}: {e}") return False - def process_feeds(self): - """Process all configured RSS feeds""" - total_processed = 0 - - # Record start time for statistics - self.start_time = time.time() - - # Ping healthcheck start - self.ping_healthcheck("/start") - + def get_instance_count(self): + """Get current instance count from API""" + if not self.config['gts_instance']: + return "unknown" + try: - for rss_url in self.config["rss_urls"]: - if not rss_url.strip(): - continue - - self.logger.info(f"Processing feed: {rss_url}") - - # Get URLs from RSS - urls = self.fetch_rss_urls(rss_url) - - # Filter out already processed URLs - new_urls = [url for url in urls if url not in self.processed_urls] - - if not new_urls: - self.logger.info("No new URLs to process") - continue - - # Rate limiting: max posts per run - urls_to_process = 
new_urls[:self.config["max_posts_per_run"]] - - self.logger.info(f"Processing {len(urls_to_process)} new URLs") - - for url in urls_to_process: - if self.lookup_post(url): - self.processed_urls.add(url) - total_processed += 1 - - # Rate limiting: delay between requests - time.sleep(self.config["delay_between_requests"]) - - # Calculate runtime - end_time = time.time() - runtime_seconds = end_time - self.start_time - runtime_formatted = str(timedelta(seconds=int(runtime_seconds))) - - # Get current instance count - try: - instance_info = requests.get(f"{self.config['server_url']}/api/v1/instance", - headers={'Authorization': f'Bearer {self.config["access_token"]}'}, - timeout=10) - if instance_info.status_code == 200: - current_instances = instance_info.json().get('stats', {}).get('domain_count', 'unknown') - else: - current_instances = 'unknown' - except Exception as e: - self.logger.error(f"Failed to get instance count: {e}") - current_instances = 'unknown' - - # Calculate new instances (if we have previous data) - new_instances = 'unknown' - if self.previous_instances > 0 and current_instances != 'unknown': - new_instances = current_instances - self.previous_instances - - # Print comprehensive statistics - print(f"\nšŸ“Š GTS-HolMirDas Run Statistics:") - print(f" ā±ļø Runtime: {runtime_formatted}") - print(f" šŸ“„ Total posts processed: {total_processed}") - print(f" 🌐 Current known instances: {current_instances}") - if new_instances != 'unknown' and new_instances > 0: - print(f" āž• New instances discovered: +{new_instances}") - elif new_instances == 0: - print(f" āž• New instances discovered: +0") - print(f" šŸ“” RSS feeds processed: {len(self.config['rss_urls'])}") - if runtime_seconds > 60: - print(f" ⚔ Posts per minute: {total_processed / (runtime_seconds / 60):.1f}") - - self.save_processed_urls(current_instances) - - # Ping healthcheck success - self.ping_healthcheck("") - + api_url = f"{self.config['gts_instance']}/api/v1/instance" + response = 
requests.get(api_url, timeout=10) + if response.status_code == 200: + return len(self.known_instances) except Exception as e: - self.logger.error(f"Error during processing: {e}") - # Ping healthcheck failure - self.ping_healthcheck("/fail") + self.logger.error(f"Failed to get instance count: {e}") + + return "unknown" + + def process_feeds(self): + """Process all RSS feeds""" + total_posts = 0 + feeds_processed = 0 + initial_instance_count = len(self.known_instances) + + for rss_url in self.config['rss_urls']: + self.logger.info(f"Processing feed: {rss_url}") + + # Fetch RSS feed + urls = self.fetch_rss_feed(rss_url) + + # Filter new URLs + new_urls = [url for url in urls if url not in self.processed_urls] + + if not new_urls: + self.logger.info("No new URLs to process") + continue + + # Limit posts per run + limited_urls = new_urls[:self.config['max_posts_per_run']] + self.logger.info(f"Processing {len(limited_urls)} new URLs") + + # Process each URL + for url in limited_urls[:self.config['max_posts_per_run']]: + if self.lookup_post(url): + total_posts += 1 + + # Mark as processed regardless of success + self.processed_urls.add(url) + + feeds_processed += 1 + + new_instances = len(self.known_instances) - initial_instance_count + + return { + 'total_posts': total_posts, + 'feeds_processed': feeds_processed, + 'instance_count': len(self.known_instances), + 'new_instances': new_instances + } + + def run(self): + """Main execution method""" + start_time = datetime.now() + + try: + stats = self.process_feeds() + + # Save state + self.save_processed_urls() + self.save_known_instances() + + # Calculate runtime + runtime = datetime.now() - start_time + + # Print summary statistics + print("\nšŸ“Š GTS-HolMirDas Run Statistics:") + print(f" ā±ļø Runtime: {runtime}") + print(f" šŸ“„ Total posts processed: {stats['total_posts']}") + print(f" 🌐 Current known instances: {stats['instance_count']}") + print(f" āž• New instances discovered: +{stats['new_instances']}") + print(f" 
šŸ“” RSS feeds processed: {stats['feeds_processed']}") + + except Exception as e: + self.logger.error(f"Fatal error: {e}") raise - def ping_healthcheck(self, endpoint=""): - """Ping healthchecks.io for monitoring""" - if not self.config.get("healthcheck_url"): - return - - try: - url = self.config["healthcheck_url"] + endpoint - requests.get(url, timeout=10) - except Exception as e: - self.logger.warning(f"Failed to ping healthcheck: {e}") - def main(): - """Main entry point""" + """Main function""" try: + print("Starting GTS-HolMirDas run...") fetcher = GTSHolMirDas() - - # Validate required config - if not fetcher.config["access_token"]: - raise ValueError("GTS_ACCESS_TOKEN environment variable is required") - - fetcher.process_feeds() - + fetcher.run() + print("GTS-HolMirDas run completed. Sleeping for 1 hour...") + except Exception as e: logging.error(f"Fatal error: {e}") raise if __name__ == "__main__": - main() + main() \ No newline at end of file From d2601cd83f730ea6af0d5b54509a5afa352b78a7 Mon Sep 17 00:00:00 2001 From: matthias Date: Mon, 4 Aug 2025 11:19:38 +0200 Subject: [PATCH 24/24] Fix inline comment parsing in RSS feeds file - Fixed control character errors when using inline comments in rss_feeds.txt - Comments after # are now properly stripped from RSS URLs - Minimal fix using split('#', 1)[0].strip() approach --- gts_holmirdas.py | 422 +++++++++++++++++++++++------------------------ 1 file changed, 206 insertions(+), 216 deletions(-) diff --git a/gts_holmirdas.py b/gts_holmirdas.py index 4b84f1d..77d1eaf 100644 --- a/gts_holmirdas.py +++ b/gts_holmirdas.py @@ -1,155 +1,118 @@ #!/usr/bin/env python3 +""" +GTS-HolMirDas: RSS-based content discovery for GoToSocial + +Inspired by HolMirDas by @aliceif: +- GitHub: https://github.com/aliceif/HolMirDas +- Fediverse: @aliceif@mkultra.x27.one + +This GoToSocial adaptation extends the original RSS-to-ActivityPub concept +with Docker deployment, multi-instance processing, and comprehensive monitoring. 
+""" + import os +import sys import time +import json import logging import requests import feedparser -import json -from urllib.parse import urlparse -from datetime import datetime +from datetime import timedelta +from urllib.parse import quote_plus class GTSHolMirDas: def __init__(self): - # Setup logging first - self.setup_logging() + """Initialize the RSS fetcher with configuration""" + self.config = { + "server_url": os.getenv("GTS_SERVER_URL", "https://your-gts-instance"), + "access_token": os.getenv("GTS_ACCESS_TOKEN", ""), + "max_posts_per_run": int(os.getenv("MAX_POSTS_PER_RUN", "25")), + "delay_between_requests": int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")), + "healthcheck_url": os.getenv("HEALTHCHECK_URL", ""), + "log_level": os.getenv("LOG_LEVEL", "INFO") + } - # Load configuration - self.config = self.load_config() - - # Setup rate limiting - self.last_request_time = 0 - self.min_delay = int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")) - - # Track processed URLs - self.data_dir = "/app/data" - self.processed_urls_file = os.path.join(self.data_dir, "processed_urls.json") - self.instances_file = os.path.join(self.data_dir, "known_instances.json") - - # Ensure data directory exists - os.makedirs(self.data_dir, exist_ok=True) - - # Load processed URLs and known instances - self.processed_urls = self.load_processed_urls() - self.known_instances = self.load_known_instances() - - def setup_logging(self): - """Setup logging configuration""" + # Setup logging FIRST logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[logging.StreamHandler()] + level=getattr(logging, self.config["log_level"]), + format='%(asctime)s - %(levelname)s - %(message)s' ) self.logger = logging.getLogger(__name__) - - def load_config(self): - """Load configuration from environment variables or file""" - config = {} - # RSS URLs - try file first, then environment variable + # Load RSS URLs from file or environment rss_urls_file = 
os.getenv("RSS_URLS_FILE") if rss_urls_file and os.path.exists(rss_urls_file): + # Load from file try: - with open(rss_urls_file, 'r', encoding='utf-8') as f: - rss_urls = [] - for line_num, line in enumerate(f, 1): - line = line.strip() - # Skip empty lines and comment-only lines - if not line or line.startswith('#'): - continue - - # Handle inline comments: split at # and take first part - if '#' in line: - url_part = line.split('#')[0].strip() - else: - url_part = line - - # Validate URL format - if url_part and url_part.startswith('http'): - # Remove any remaining control characters - clean_url = ''.join(char for char in url_part if ord(char) >= 32) - rss_urls.append(clean_url) - self.logger.debug(f"Line {line_num}: Added URL: {clean_url}") - elif url_part: - self.logger.warning(f"Line {line_num}: Invalid URL format: {url_part}") - - self.logger.info(f"Loaded {len(rss_urls)} RSS URLs from file: {rss_urls_file}") + with open(rss_urls_file, 'r') as f: + self.config["rss_urls"] = [ + line.split('#', 1)[0].strip() for line in f + if line.strip() and not line.strip().startswith('#') + ] + self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from file: {rss_urls_file}") except Exception as e: self.logger.error(f"Could not load RSS URLs from file {rss_urls_file}: {e}") - rss_urls = [] + self.config["rss_urls"] = [] else: # Fallback to environment variable - rss_urls_env = os.getenv("RSS_URLS", "") - rss_urls = [url.strip() for url in rss_urls_env.split(",") if url.strip()] - if rss_urls: - self.logger.info(f"Loaded {len(rss_urls)} RSS URLs from environment variable") - else: - self.logger.warning("No RSS URLs found in file or environment variable") - - config['rss_urls'] = rss_urls - config['gts_instance'] = os.getenv("GTS_SERVER_URL") or os.getenv("GTS_INSTANCE") - config['max_posts_per_run'] = int(os.getenv("MAX_POSTS_PER_RUN", "10")) + self.config["rss_urls"] = [ + url.strip() for url in os.getenv("RSS_URLS", "").split(",") + if url.strip() + ] + if 
self.config["rss_urls"]: + self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from environment") - return config + # Load processed URLs from persistent storage + self.processed_urls_file = "/app/data/processed_urls.json" + self.processed_urls = self.load_processed_urls() + + # Statistics tracking + self.previous_instances = getattr(self, 'previous_instances', 0) def load_processed_urls(self): - """Load processed URLs from file""" + """Load previously processed URLs and instance count from file""" try: if os.path.exists(self.processed_urls_file): with open(self.processed_urls_file, 'r') as f: - return set(json.load(f)) + data = json.load(f) + # Load previous instance count for statistics + self.previous_instances = data.get('previous_instances', 0) + return set(data.get('processed_urls', [])) except Exception as e: self.logger.warning(f"Could not load processed URLs: {e}") + return set() - def save_processed_urls(self): - """Save processed URLs to file""" + def save_processed_urls(self, current_instances=None): + """Save processed URLs and current instance count to file""" try: + os.makedirs(os.path.dirname(self.processed_urls_file), exist_ok=True) + data = { + 'processed_urls': list(self.processed_urls), + 'last_updated': time.time() + } + # Save current instance count for next run + if current_instances is not None and current_instances != 'unknown': + data['previous_instances'] = current_instances + with open(self.processed_urls_file, 'w') as f: - json.dump(list(self.processed_urls), f) + json.dump(data, f, indent=2) except Exception as e: self.logger.error(f"Could not save processed URLs: {e}") - def load_known_instances(self): - """Load known instances from file""" + def fetch_rss_urls(self, rss_url): + """Fetch URLs from RSS feed""" try: - if os.path.exists(self.instances_file): - with open(self.instances_file, 'r') as f: - return set(json.load(f)) - except Exception as e: - self.logger.warning(f"Could not load known instances: {e}") - return 
set() - - def save_known_instances(self): - """Save known instances to file""" - try: - with open(self.instances_file, 'w') as f: - json.dump(list(self.known_instances), f) - except Exception as e: - self.logger.error(f"Could not save known instances: {e}") - - def rate_limit(self): - """Implement rate limiting between requests""" - current_time = time.time() - time_since_last = current_time - self.last_request_time - - if time_since_last < self.min_delay: - sleep_time = self.min_delay - time_since_last - time.sleep(sleep_time) - - self.last_request_time = time.time() - - def fetch_rss_feed(self, url): - """Fetch and parse RSS feed""" - try: - self.logger.info(f"Fetching RSS feed: {url}") - self.rate_limit() + self.logger.info(f"Fetching RSS feed: {rss_url}") - response = requests.get(url, timeout=30) - response.raise_for_status() + # Parse RSS feed + feed = feedparser.parse(rss_url) - feed = feedparser.parse(response.content) + if feed.bozo: + self.logger.warning(f"RSS feed may have issues: {rss_url}") + + # Extract URLs from entries urls = [] - for entry in feed.entries: if hasattr(entry, 'link'): urls.append(entry.link) @@ -158,134 +121,161 @@ class GTSHolMirDas: return urls except Exception as e: - self.logger.error(f"Error fetching RSS feed {url}: {e}") + self.logger.error(f"Error fetching RSS feed {rss_url}: {e}") return [] - def lookup_post(self, url): - """Lookup a post URL on the GTS instance""" - if not self.config['gts_instance']: - self.logger.warning("No GTS instance configured") - return False - + def lookup_post(self, post_url): + """Look up a post URL using GTS search API""" try: - lookup_url = f"{self.config['gts_instance']}/api/v1/statuses/lookup" - params = {"uri": url} + # Prepare search API call + search_url = f"{self.config['server_url']}/api/v2/search" + params = { + 'q': post_url, + 'type': 'statuses', + 'resolve': 'true', + 'limit': 1 + } + headers = { + 'Authorization': f'Bearer {self.config["access_token"]}', + 'Content-Type': 
'application/json' + } - self.rate_limit() - response = requests.get(lookup_url, params=params, timeout=10) + # Make API call + response = requests.get( + search_url, + params=params, + headers=headers, + timeout=30 + ) if response.status_code == 200: - self.logger.info(f"Successfully looked up: {url}") - # Extract and store instance info - parsed_url = urlparse(url) - instance = f"{parsed_url.scheme}://{parsed_url.netloc}" - self.known_instances.add(instance) - return True - elif response.status_code == 404: - self.logger.warning(f"No results for: {url}") - return False + results = response.json() + if results.get('statuses') or results.get('accounts'): + self.logger.info(f"Successfully looked up: {post_url}") + return True + else: + self.logger.warning(f"No results for: {post_url}") + return False else: - self.logger.warning(f"Lookup failed for {url}: {response.status_code}") + self.logger.error(f"API error {response.status_code} for {post_url}: {response.text}") return False - - except Exception as e: - self.logger.error(f"Error looking up {url}: {e}") - return False - def get_instance_count(self): - """Get current instance count from API""" - if not self.config['gts_instance']: - return "unknown" - - try: - api_url = f"{self.config['gts_instance']}/api/v1/instance" - response = requests.get(api_url, timeout=10) - if response.status_code == 200: - return len(self.known_instances) - except Exception as e: - self.logger.error(f"Failed to get instance count: {e}") - - return "unknown" + except requests.exceptions.RequestException as e: + self.logger.error(f"Error looking up {post_url}: {e}") + return False def process_feeds(self): - """Process all RSS feeds""" - total_posts = 0 - feeds_processed = 0 - initial_instance_count = len(self.known_instances) - - for rss_url in self.config['rss_urls']: - self.logger.info(f"Processing feed: {rss_url}") - - # Fetch RSS feed - urls = self.fetch_rss_feed(rss_url) - - # Filter new URLs - new_urls = [url for url in urls if url 
not in self.processed_urls] - - if not new_urls: - self.logger.info("No new URLs to process") - continue - - # Limit posts per run - limited_urls = new_urls[:self.config['max_posts_per_run']] - self.logger.info(f"Processing {len(limited_urls)} new URLs") - - # Process each URL - for url in limited_urls[:self.config['max_posts_per_run']]: - if self.lookup_post(url): - total_posts += 1 - - # Mark as processed regardless of success - self.processed_urls.add(url) - - feeds_processed += 1 - - new_instances = len(self.known_instances) - initial_instance_count - - return { - 'total_posts': total_posts, - 'feeds_processed': feeds_processed, - 'instance_count': len(self.known_instances), - 'new_instances': new_instances - } + """Process all configured RSS feeds""" + total_processed = 0 + + # Record start time for statistics + self.start_time = time.time() + + # Ping healthcheck start + self.ping_healthcheck("/start") - def run(self): - """Main execution method""" - start_time = datetime.now() - try: - stats = self.process_feeds() - - # Save state - self.save_processed_urls() - self.save_known_instances() - + for rss_url in self.config["rss_urls"]: + if not rss_url.strip(): + continue + + self.logger.info(f"Processing feed: {rss_url}") + + # Get URLs from RSS + urls = self.fetch_rss_urls(rss_url) + + # Filter out already processed URLs + new_urls = [url for url in urls if url not in self.processed_urls] + + if not new_urls: + self.logger.info("No new URLs to process") + continue + + # Rate limiting: max posts per run + urls_to_process = new_urls[:self.config["max_posts_per_run"]] + + self.logger.info(f"Processing {len(urls_to_process)} new URLs") + + for url in urls_to_process: + if self.lookup_post(url): + self.processed_urls.add(url) + total_processed += 1 + + # Rate limiting: delay between requests + time.sleep(self.config["delay_between_requests"]) + # Calculate runtime - runtime = datetime.now() - start_time + end_time = time.time() + runtime_seconds = end_time - 
self.start_time + runtime_formatted = str(timedelta(seconds=int(runtime_seconds))) - # Print summary statistics - print("\nšŸ“Š GTS-HolMirDas Run Statistics:") - print(f" ā±ļø Runtime: {runtime}") - print(f" šŸ“„ Total posts processed: {stats['total_posts']}") - print(f" 🌐 Current known instances: {stats['instance_count']}") - print(f" āž• New instances discovered: +{stats['new_instances']}") - print(f" šŸ“” RSS feeds processed: {stats['feeds_processed']}") + # Get current instance count + try: + instance_info = requests.get(f"{self.config['server_url']}/api/v1/instance", + headers={'Authorization': f'Bearer {self.config["access_token"]}'}, + timeout=10) + if instance_info.status_code == 200: + current_instances = instance_info.json().get('stats', {}).get('domain_count', 'unknown') + else: + current_instances = 'unknown' + except Exception as e: + self.logger.error(f"Failed to get instance count: {e}") + current_instances = 'unknown' + # Calculate new instances (if we have previous data) + new_instances = 'unknown' + if self.previous_instances > 0 and current_instances != 'unknown': + new_instances = current_instances - self.previous_instances + + # Print comprehensive statistics + print(f"\nšŸ“Š GTS-HolMirDas Run Statistics:") + print(f" ā±ļø Runtime: {runtime_formatted}") + print(f" šŸ“„ Total posts processed: {total_processed}") + print(f" 🌐 Current known instances: {current_instances}") + if new_instances != 'unknown' and new_instances > 0: + print(f" āž• New instances discovered: +{new_instances}") + elif new_instances == 0: + print(f" āž• New instances discovered: +0") + print(f" šŸ“” RSS feeds processed: {len(self.config['rss_urls'])}") + if runtime_seconds > 60: + print(f" ⚔ Posts per minute: {total_processed / (runtime_seconds / 60):.1f}") + + self.save_processed_urls(current_instances) + + # Ping healthcheck success + self.ping_healthcheck("") + except Exception as e: - self.logger.error(f"Fatal error: {e}") + self.logger.error(f"Error during 
processing: {e}") + # Ping healthcheck failure + self.ping_healthcheck("/fail") raise + def ping_healthcheck(self, endpoint=""): + """Ping healthchecks.io for monitoring""" + if not self.config.get("healthcheck_url"): + return + + try: + url = self.config["healthcheck_url"] + endpoint + requests.get(url, timeout=10) + except Exception as e: + self.logger.warning(f"Failed to ping healthcheck: {e}") + def main(): - """Main function""" + """Main entry point""" try: - print("Starting GTS-HolMirDas run...") fetcher = GTSHolMirDas() - fetcher.run() - print("GTS-HolMirDas run completed. Sleeping for 1 hour...") - + + # Validate required config + if not fetcher.config["access_token"]: + raise ValueError("GTS_ACCESS_TOKEN environment variable is required") + + fetcher.process_feeds() + except Exception as e: logging.error(f"Fatal error: {e}") raise if __name__ == "__main__": - main() \ No newline at end of file + main()