💻 fetch_a16z_posts.py

python · 70 lines · ⬇️ Download

import json, urllib.request, urllib.parse, base64, time

# Algolia application ID and index name, taken from the page.
APP_ID = "FLQ5EG9WBF"
INDEX = "v2_production_searchable_posts"

# Fetch a search API key from the key-generation endpoint.
# This step is best-effort: on any failure the error is printed and API_KEY
# is left empty, so the script keeps running and later requests are sent
# with an empty key.
key_endpoint = "https://ohhu1ogx2a.execute-api.us-west-1.amazonaws.com/prod/api/generate-key"
req = urllib.request.Request(key_endpoint)
try:
    # The context manager closes the HTTP response even when reading or
    # parsing the body raises.
    with urllib.request.urlopen(req, timeout=10) as resp:
        key_data = json.loads(resp.read())
    API_KEY = key_data.get('key', '')
    # Only a prefix of the key is echoed to the console.
    print(f"Got API key: {API_KEY[:20]}...")
except Exception as e:
    print(f"Key fetch error: {e}")
    # No fallback key is available here; continue with an empty one.
    API_KEY = ""

# Lower bound for the date filter: the Unix timestamp 365 days before now.
one_year_ago = int(time.time()) - 365 * 24 * 3600

# Query endpoint for the index. The host is derived from APP_ID so the URL
# and the X-Algolia-Application-Id header cannot drift apart. It does not
# change between pages, so it is built once instead of on every iteration.
url = f"https://{APP_ID}-dsn.algolia.net/1/indexes/{INDEX}/query"

# Collect every hit newer than one_year_ago, one page of up to 100 hits at
# a time, until the last page reported by the response has been read.
all_posts = []
page = 0
while True:
    payload = json.dumps({
        "query": "",
        "page": page,
        "hitsPerPage": 100,
        "filters": f"date_timestamp > {one_year_ago}",
    }).encode()

    req = urllib.request.Request(url, data=payload, method="POST", headers={
        "Content-Type": "application/json",
        "X-Algolia-Application-Id": APP_ID,
        "X-Algolia-API-Key": API_KEY,
    })

    try:
        # The context manager closes the HTTP response even when reading or
        # parsing the body raises.
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        hits = data.get('hits', [])
        total_pages = data.get('nbPages', 0)
        total_hits = data.get('nbHits', 0)
        print(f"Page {page}: {len(hits)} hits, total: {total_hits}, pages: {total_pages}")

        for hit in hits:
            # Prefer 'uri' and fall back to 'slug'. Using `or` also covers a
            # key that is present but null/empty, which would otherwise raise
            # on concatenation and abort the remaining pages.
            path = hit.get('uri') or hit.get('slug') or ''
            post = {
                'title': hit.get('title', ''),
                'url': 'https://a16zcrypto.com' + path,
                'date': hit.get('date', ''),
                'tags': hit.get('tags', []),
                'focus_areas': hit.get('focus_areas', []),
                'type': hit.get('type', ''),
            }
            all_posts.append(post)

        # Pages are zero-based, so the final page is total_pages - 1. This
        # also stops immediately when the response reports zero pages.
        if page >= total_pages - 1:
            break
        page += 1
    except Exception as e:
        # Best-effort: report the failure and keep whatever was collected.
        print(f"Error on page {page}: {e}")
        break

# Report how many posts were gathered, then write them to a16z_posts.json
# as indented JSON in the current working directory.
post_count = len(all_posts)
print(f"\nTotal posts collected: {post_count}")
with open('a16z_posts.json', 'w') as out_file:
    serialized = json.dumps(all_posts, indent=2)
    out_file.write(serialized)
print("Saved to a16z_posts.json")