Pythonでニュースを取得してみた

ソースコード

import html
import re
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from urllib.parse import quote

import requests

# Matches any HTML tag; compiled once because it is applied to every feed item.
_HTML_TAG_RE = re.compile(r'<[^>]+>')


def _element_text(item, tag, default=''):
    """Return the text of child ``tag`` of ``item``, or ``default``.

    ``default`` is used both when the child element is missing and when it is
    present but has no text (``Element.text`` is ``None`` in that case).
    """
    child = item.find(tag)
    if child is None or child.text is None:
        return default
    return child.text


def _clean_description(raw):
    """Reduce an HTML description snippet to plain text.

    Tags are removed first and entities decoded afterwards, so that escaped
    angle brackets in the text are not mistaken for markup.
    """
    if not raw:
        return ''
    return html.unescape(_HTML_TAG_RE.sub('', raw)).strip()


def _parse_pub_date(pub_date):
    """Parse an RFC 2822 date string into a UTC-aware ``datetime``.

    Returns ``None`` when the string is empty or cannot be parsed, so callers
    can fall back to the raw text.
    """
    if not pub_date:
        return None
    try:
        parsed = parsedate_to_datetime(pub_date)
    except (TypeError, ValueError):
        # Older Python versions raise TypeError, newer ones ValueError.
        return None
    if parsed.tzinfo is None:
        # A date without zone information is treated as UTC so that every
        # value used as a sort key is timezone-aware and comparable.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def _article_from_item(item, query):
    """Build an article dict from one RSS ``<item>``; ``None`` if it has no title."""
    title = _element_text(item, 'title')
    if not title:
        # De-duplication is title-based, so an untitled item is unusable.
        return None
    return {
        'title': title,
        'description': _clean_description(_element_text(item, 'description')),
        'source': _element_text(item, 'source', 'Google News'),
        'url': _element_text(item, 'link'),
        'pub_date': _element_text(item, 'pubDate'),
        'query': query,
    }


def _print_article(index, article):
    """Print one article in the report format used by the entry point."""
    pub_date = article['pub_date']
    published = _parse_pub_date(pub_date)
    # Fall back to the raw feed string when the date cannot be parsed.
    if published is not None:
        formatted_date = published.strftime('%Y年%m月%d日 %H:%M')
    else:
        formatted_date = pub_date

    print(f"【AIニュース {index}】")
    print(f"📰 {article['title']}")
    description = article['description']
    # Very short descriptions are skipped; long ones are truncated.
    if description and len(description) > 20:
        if len(description) > 200:
            description = description[:200] + '...'
        print(f"📝 {description}")
    print(f"📺 {article['source']}")
    print(f"📅 {formatted_date}")
    print(f"🔗 {article['url']}")
    print("=" * 80)
    print()


def get_japanese_ai_news_google_rss():
    """Fetch Japanese AI news from the Google News RSS search feed and print it.

    Runs several search queries, takes up to 8 items from each, drops items
    whose title was already seen, sorts the result newest-first by parsed
    publication date (items with an unparseable date go last) and prints the
    first 10 articles. Progress and error messages are printed as well.

    A query that fails (network error, non-200 status, malformed XML) is
    reported and skipped; the remaining queries still run.

    Returns:
        None. All output goes to stdout.
    """
    print("🚀 Google News RSSで日本語AIニュースを取得開始...\n")

    # Several search expressions are used to widen coverage.
    search_queries = [
        'AI OR ChatGPT OR "生成AI"',
        '"人工知能" OR OpenAI OR Claude',
        '"機械学習" OR "ディープラーニング"',
        'ChatGPT OR Gemini OR "AI技術"'
    ]
    base_url = "https://news.google.com/rss/search"

    all_articles = []
    seen_titles = set()  # O(1) duplicate check instead of rescanning the list

    for query in search_queries:
        print(f"🔄 検索中: {query}")

        params = {
            'q': query,
            'hl': 'ja',       # interface language: Japanese
            'gl': 'JP',       # region: Japan
            'ceid': 'JP:ja',  # edition: Japan / Japanese
            # NOTE(review): intended as "past 7 days"; verify the endpoint
            # honours `when` as a URL parameter rather than only as a
            # `when:7d` operator inside the query - TODO confirm.
            'when': '7d',
        }

        try:
            # requests URL-encodes the parameters itself.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as e:
            print(f"❌ {query}の取得エラー: {e}")
            continue

        if response.status_code != 200:
            print(f"❌ HTTPエラー: {response.status_code}")
            continue

        try:
            root = ET.fromstring(response.content)
        except ET.ParseError as e:
            print(f"❌ {query}の取得エラー: {e}")
            continue

        items = root.findall('.//item')

        for item in items[:8]:  # at most 8 items per query
            article = _article_from_item(item, query)
            if article is None or article['title'] in seen_titles:
                continue
            seen_titles.add(article['title'])
            all_articles.append(article)

        print(f"✅ {len(items)}件の記事を発見（重複除去前）")

    if not all_articles:
        print("❌ すべての検索クエリでニュース取得に失敗しました")
        print("💡 インターネット接続を確認してください")
        return

    print(f"\n🎉 合計{len(all_articles)}件の日本語AIニュースを取得成功！\n")

    # Sort on the parsed timestamp: comparing the raw RFC 2822 strings would
    # order by weekday name rather than by time. Unparseable dates sort last.
    oldest = datetime.min.replace(tzinfo=timezone.utc)
    all_articles.sort(
        key=lambda article: _parse_pub_date(article['pub_date']) or oldest,
        reverse=True,
    )

    for i, article in enumerate(all_articles[:10], 1):  # show the top 10
        _print_article(i, article)

# Run the Google News RSS fetch. This executes at module load, so importing
# this file also triggers the network requests and the printed report.
get_japanese_ai_news_google_rss()