User:Blippy1998Bot

Wikipedia editing bot run by Blippy1998 From Wikipedia, the free encyclopedia

As of February 9, 2026, this bot runs the code below every hour, on the hour, while my computer is turned on. The JSON file that holds the most recent data is not included here, and neither are the log files. Obviously my user data isn't included either; you need to set up authentication if you want to run this yourself, and you'd probably have to clear it with the admins.

# written mostly by AI

import requests
from bs4 import BeautifulSoup
import json
import os
import re
import datetime
import logging
import mwclient
from dotenv import load_dotenv


# --- CONFIGURATION ---
# When True, the script logs what it would edit but makes no Wikipedia edits.
IS_DRY_RUN = False

# Total number of seats in the U.S. House; used as a sanity check that the
# scraped party counts sum to a full chamber.
HOUSE_SIZE = 435
URL_CLERK = "https://clerk.house.gov/"
URL_PRESS_GALLERY = "https://pressgallery.house.gov/member-data/party-breakdown"
# Local JSON store holding the last-seen counts per source.
DATA_FILE = "party_breakdown_data.json"
# One timestamped log file per run, kept under logs/.
LOG_FILE = os.path.join("logs", f"scraper_{datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H-%M-%SZ')}.log")

# Wikipedia template pages to edit
WIKIPEDIA_REPUBLICAN_TEMPLATE = "Template:HouseRepublicanTally"
WIKIPEDIA_DEMOCRATIC_TEMPLATE = "Template:HouseDemocraticTally"

# User-Agent for Wikipedia and Congress websites, as requested.
USER_AGENT_WIKIPEDIA = "Blippy1998Bot/1.2.0 ([[User:Blippy1998]])"
USER_AGENT_CONGRESS = """Mozilla/5.0 Blippy1998Bot/1.2.0; scraping automatically on \
behalf of Wikipedia user Blippy1998 (https://en.wikipedia.org/wiki/User:Blippy1998) \
for the purpose of keeping membership tallies up to date on Wikipedia"""


# --- DO NOT EDIT THE FUNCTION BELOW ---
def get_party_breakdown_from_clerk():
    """
    Fetches and parses the party breakdown data from the House Clerk website.

    Returns:
        A dictionary with the party counts, e.g., {'Republicans': 219, 'Democrats': 212},
        or None if an error occurs.
    """
    try:
        # Send a request to the website. The timeout prevents the hourly run
        # from hanging forever if the site stops responding.
        page = requests.get(URL_CLERK, headers={'User-Agent': USER_AGENT_CONGRESS}, timeout=30)
        page.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Parse the HTML content
        soup = BeautifulSoup(page.content, "html.parser")

        # Find the main container for the party numbers based on the new HTML structure
        container = soup.find("div", id="menu-member-stats")

        if not container:
            # Log at error level, matching the press gallery scraper's handling
            # of the same failure mode.
            logging.error("Could not find the party breakdown container div ('menu-member-stats') on the page.")
            return None

        # Extract the numbers from within the container.
        # The structure is <dt>Label<span>Number</span></dt>
        dt_tags = container.find_all('dt')

        party_counts = {}
        for dt in dt_tags:
            # 1. Get the number from the span
            number_span = dt.find('span')
            if not number_span:
                logging.error(f"Could not find the number associated with this tag: {dt.get_text()}")
                return None

            count = int(number_span.text)

            # 2. Get the label by removing the span part from the full text
            # (This leaves "Republicans", "Democrats", etc.)
            label = dt.get_text().replace(number_span.text, '').strip()

            party_counts[label] = count

        # A sanity check to ensure we got something that looks right:
        # both major parties are present and the counts sum to a full chamber.
        are_parties_in_results = "Republicans" in party_counts and "Democrats" in party_counts
        computed_number_of_seats = sum(party_counts.values())
        if are_parties_in_results and computed_number_of_seats == HOUSE_SIZE:
            return party_counts
        else:
            logging.error(f"The parsed data doesn't seem to contain the expected party information, or the computed seat count ({computed_number_of_seats}) is not equal to {HOUSE_SIZE}.")
            logging.error(f"Parsed data: {party_counts}")
            return None

    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching the URL: {e}")
        return None
    except Exception as e:
        logging.error(f"An unexpected error occurred when parsing the House website: {e}", exc_info=True)
        return None


def get_party_breakdown_from_press_gallery():
    """
    Fetches and parses the party breakdown data from the House Press Gallery website.

    Returns:
        A dictionary with the party counts, e.g., {'Republicans': 219, 'Democrats': 212},
        or None if an error occurs.
    """
    try:
        # Send a request to the website. The timeout prevents the hourly run
        # from hanging forever if the site stops responding.
        page = requests.get(URL_PRESS_GALLERY, headers={'User-Agent': USER_AGENT_CONGRESS}, timeout=30)
        page.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Parse the HTML content
        soup = BeautifulSoup(page.content, "html.parser")

        # Find the main container for the party numbers based on the new HTML structure
        container = soup.find("div", id="houseNumberCount")

        if not container:
            logging.error("Could not find the party breakdown container div ('houseNumberCount') on the page.")
            return None

        # Extract the numbers from the h5 tags within the container
        party_boxes = container.find_all("h5", class_="partyBox")

        party_counts = {}
        for box in party_boxes:
            party_name_tag = box.find("i", class_="party")
            party_count_tag = box.find("span", class_="num")

            if party_name_tag and party_count_tag:
                party_name = party_name_tag.text.strip()
                party_count_str = party_count_tag.text.strip().replace('*', '') # The vacancies number might have an asterisk, so we remove it
                party_counts[party_name] = int(party_count_str)

        # A sanity check to ensure we got something that looks right:
        # both major parties are present and the counts sum to a full chamber.
        are_parties_in_results = "Republicans" in party_counts and "Democrats" in party_counts
        computed_number_of_seats = sum(party_counts.values())
        if are_parties_in_results and computed_number_of_seats == HOUSE_SIZE:
            return party_counts
        else:
            logging.error(f"The parsed data doesn't seem to contain the expected party information, or the computed seat count ({computed_number_of_seats}) is not equal to {HOUSE_SIZE}.")
            logging.error(f"Parsed data: {party_counts}")
            return None

    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching the URL: {e}")
        return None
    except Exception as e:
        logging.error(f"An unexpected error occurred when parsing the House website: {e}", exc_info=True)
        return None


# NOTE: earlier AI-generated draft of split_tally_from_template_page, kept commented out for reference.
# def split_tally_from_template_page(page) -> str:
#     """
#     Args
#         page (mwclient.Page) - the page object
#     """
#     text = page.text()
    
#     # assuming there is absolutely no text before the number
#     pattern = r"^\d{1,3}\b"
#     match = re.match(pattern, text.split("\n")[0])
#     if not match:
#         raise ValueError(f"{page.name} is not formatted as expected! text:\n{text}")
    
#     result = match.group()
    
#     return result, result.join(text.split(result)[1:])


def split_tally_from_template_page(page: "mwclient.page.Page") -> tuple[str, str]:
    """
    Splits a page's content into the leading number (tally) and the rest of the text.

    Args:
        page (mwclient.page.Page): The page object to get text from.
            (Annotated as a forward reference; any object with ``.text()``
            and ``.name`` works.)

    Returns:
        A tuple where the first item is the number string and the second is
        the remainder of the page's content.

    Raises:
        ValueError: If the page text does not start with a 1-3 digit number.
    """
    text = page.text()

    # This pattern finds a whole number of 1-3 digits at the start of the string
    # assuming there is absolutely no text before the number
    pattern = r"^\d{1,3}\b"
    match = re.match(pattern, text)

    if not match:
        # Note the explicit separators: adjacent f-strings concatenate with no
        # whitespace between them, which previously fused the message into one run.
        raise ValueError(
            f"Page '{page.name}' is not formatted as expected! "
            f"The text does not start with a number that is at most 3 digits. "
            f"text:\n{text}"
        )

    # The matched number string (e.g., "219")
    tally = match.group()

    # The rest of the text, found by slicing from the end of the match
    remainder = text[match.end():]

    return tally, remainder


def update_wikipedia_template(site, string, page_title, new_tally) -> None:
    """
    Updates a single Wikipedia template page if the content has changed.

    Args:
        site (mwclient.Site): The site object to perform edits on.
        string (str): The name of the party (e.g., "Republican"), used in logs.
        page_title (str): The full title of the template to edit.
        new_tally (int): The new number to write to the page.

    Raises:
        ValueError: If the template page does not exist.
        Exception: Re-raises whatever page.edit() raised on failure.
    """
    if new_tally is None:
        error_string = f"Could not find {string} tally in new data."
        logging.critical(error_string)
        # TODO: raise an exception maybe

        return None

    new_tally_str = str(new_tally)
    logging.info(f"Checking {string} tally on Wikipedia; the new number is {new_tally_str}.")

    page = site.pages[page_title]
    if not page.exists:
        error_string = f"{page_title} does not exist!"
        logging.critical(error_string)
        raise ValueError(error_string)

    # Good bot etiquette: only edit if the number has actually changed.
    page_tally, page_remainder = split_tally_from_template_page(page)
    if page_tally == new_tally_str:
        logging.info(f"No change needed for {page_title}. The tally is already {new_tally_str}.")
        return None

    logging.info(f"Attempting to change tally on {page_title} from {page_tally} to {new_tally_str}.")
    # Edit summary is public on Wikipedia; previous wording was ungrammatical.
    summary = f"Automated update: changed tally to {new_tally_str} based on {URL_CLERK} or {URL_PRESS_GALLERY}, whichever is determined to be most recent."
    if not IS_DRY_RUN:
        try:
            page.edit(new_tally_str + page_remainder, summary=summary)
        except Exception as e:
            logging.critical(f"Failed to edit {page_title}; exception follows.")
            logging.critical(e)
            raise e
        else:
            logging.info(f"Successfully edited {page_title}.")
    else:
        logging.info(f"DRY RUN: Would have edited {page_title} with content '{new_tally_str + page_remainder}' and summary '{summary}'")


def update_wikipedia_templates(new_tallies):
    """
    Logs into Wikipedia using bot credentials and edits the tally templates.

    Reads credentials from environment variables: ACCESS_TOKEN

    Args:
        new_tallies (dict): Party counts, e.g. {'Republicans': 219, 'Democrats': 212}.

    Raises:
        RuntimeError: If ACCESS_TOKEN is not set during a real (non-dry) run.
    """
    access_token = os.getenv("ACCESS_TOKEN")

    if not IS_DRY_RUN:
        # Fail fast with a clear message instead of sending "Bearer None"
        # and getting an opaque API-side authentication error.
        if not access_token:
            error_string = "ACCESS_TOKEN environment variable is not set; cannot authenticate to Wikipedia."
            logging.critical(error_string)
            raise RuntimeError(error_string)
        auth_headers = {
            'Authorization': f'Bearer {access_token}'
        }
        site = mwclient.Site('en.wikipedia.org', scheme="https", clients_useragent=USER_AGENT_WIKIPEDIA, custom_headers=auth_headers)
        info = site.api('query', meta='userinfo')
        username = info['query']['userinfo']['name']
        logging.info(f"Successfully authenticated on Wikipedia as {username}")
    else:
        site = mwclient.Site('en.wikipedia.org', scheme="https", clients_useragent=USER_AGENT_WIKIPEDIA)
        logging.info("DRY RUN: Skipping Wikipedia login.")

    # --- Edit Republican Template ---
    num_republicans = new_tallies.get("Republicans")
    update_wikipedia_template(site, "Republican", WIKIPEDIA_REPUBLICAN_TEMPLATE, num_republicans)

    # --- Edit Democratic Template ---
    num_democrats = new_tallies.get("Democrats")
    update_wikipedia_template(site, "Democratic", WIKIPEDIA_DEMOCRATIC_TEMPLATE, num_democrats)


def load_previous_data():
    """Loads the previously saved party breakdown data from the JSON file.

    Returns:
        The parsed JSON dict, or an empty dict if the file doesn't exist.
    """
    # EAFP: try to open directly rather than os.path.exists + open, which
    # has a time-of-check/time-of-use race.
    try:
        with open(DATA_FILE, 'r') as f:
            logging.info(f"Loading previous data from {DATA_FILE}.")
            return json.load(f)
    except FileNotFoundError:
        logging.warning("No previous data file found.")
        return {} # Return an empty dict if the file doesn't exist


def save_current_data(source, data, timestamp):
    """Saves the current party breakdown data to the JSON file.

    Args:
        source (str): Data source key, e.g. "clerk" or "press_gallery".
        data (dict): The party counts to store for that source.
        timestamp (str): ISO-format UTC timestamp of when the update was noticed.
    """
    # Tolerate a missing file (first run) instead of crashing on open('r').
    try:
        with open(DATA_FILE, 'r') as f:
            d = json.load(f)
    except FileNotFoundError:
        d = {}
    # setdefault tolerates a source key that has never been written before.
    entry = d.setdefault(source, {})
    entry["data"] = data
    entry["most_recent_update_noticed_at"] = timestamp
    with open(DATA_FILE, "w") as f:
        json.dump(d, f, indent="\t")
    logging.info(f"Data saved to {DATA_FILE}.")


def main():
    """
    Main function to run the scraper, compare data, and report changes.

    Returns:
        0 on success (including "no change" and "baseline saved" runs), or a
        nonzero error code: 0x1001 clerk fetch failed, 0x1002 press gallery
        fetch failed, 0x1003 both sources and stored data are out of sync.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(LOG_FILE),
            logging.StreamHandler()
        ]
    )

    logging.info("--- Running Scraper ---")
    if IS_DRY_RUN:
        logging.warning("IS_DRY_RUN is enabled. No edits will be made to Wikipedia.")

    # 1. Get the current data from the websites
    timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()

    current_data_clerk = get_party_breakdown_from_clerk()
    if not current_data_clerk:
        logging.critical("Could not retrieve current data from clerk. Exiting.")
        return 0x1001
    current_data_press_gallery = get_party_breakdown_from_press_gallery()
    if not current_data_press_gallery:
        logging.critical("Could not retrieve current data from press gallery. Exiting.")
        return 0x1002
    if current_data_clerk != current_data_press_gallery:
        logging.warning("clerk site doesn't match press gallery site. attempting to use the more recent one.")

    # Look up fresh data by source name (replaces the previous eval() hack).
    current_data_by_source = {
        "clerk": current_data_clerk,
        "press_gallery": current_data_press_gallery,
    }

    # 2. Load the previously stored data. Using .get() so a missing file
    # (load_previous_data returns {}) doesn't raise KeyError here.
    previous_data = load_previous_data()
    logging.info("Comparing with previous data")
    missing_sources = [s for s in ("clerk", "press_gallery") if not previous_data.get(s)]
    if missing_sources:
        # Save baselines for ALL missing sources before returning, so a
        # first run seeds both sources instead of only the first one.
        for source in missing_sources:
            logging.warning(f"No previous {source} data found. Saving current {source} data as the baseline.")
            save_current_data(source, current_data_by_source[source], timestamp)
        previous_data = load_previous_data()
        for source in missing_sources:
            if not previous_data.get(source):
                raise ValueError("previous data was not loaded correctly! don't know why.")
        return 0

    # Plain concatenation sidesteps the pre-3.12 ban on backslashes in f-strings.
    logging.info("current clerk data:\n" + json.dumps(current_data_clerk, indent="\t"))
    logging.info("current press gallery data:\n" + json.dumps(current_data_press_gallery, indent="\t"))
    logging.info("previous data:\n" + json.dumps(previous_data, indent="\t"))

    # 3. Compare and check for changes
    if previous_data["clerk"]["data"] != previous_data["press_gallery"]["data"]:
        logging.warning("clerk and press gallery stored data doesn't match.")

    # Whichever source we most recently noticed an update from is treated as
    # the authoritative "previous" state.
    if datetime.datetime.fromisoformat(previous_data["clerk"]["most_recent_update_noticed_at"]) > datetime.datetime.fromisoformat(previous_data["press_gallery"]["most_recent_update_noticed_at"]):
        previous_data_real = previous_data["clerk"]["data"]
    else:
        previous_data_real = previous_data["press_gallery"]["data"]

    if current_data_clerk == current_data_press_gallery:
        if current_data_clerk == previous_data_real:
            logging.info("Result: No changes detected.")
        else:
            logging.info("Result: CHANGE DETECTED!")
            # update_wikipedia_templates raises on failure, intentionally
            # preventing the saving of the new data below.
            update_wikipedia_templates(current_data_clerk)
            save_current_data("clerk", current_data_clerk, timestamp)
            save_current_data("press_gallery", current_data_press_gallery, timestamp)
    else:
        clerk_changed = current_data_clerk != previous_data["clerk"]["data"]
        press_gallery_changed = current_data_press_gallery != previous_data["press_gallery"]["data"]
        if clerk_changed and press_gallery_changed:
            logging.critical("press gallery data, clerk data, and all previous data is out of sync; it's not obvious what info is the most updated without further context.")
            return 0x1003
        if clerk_changed:
            logging.info("Result: CHANGE DETECTED!")
            logging.warning("assuming clerk data is most recent.")
            update_wikipedia_templates(current_data_clerk)
            save_current_data("clerk", current_data_clerk, timestamp)
        else: # due to the conditions above, only the press gallery changed
            logging.info("Result: CHANGE DETECTED!")
            logging.warning("assuming press_gallery data is most recent.")
            update_wikipedia_templates(current_data_press_gallery)
            save_current_data("press_gallery", current_data_press_gallery, timestamp)

    logging.info("--- Scraper run finished ---")

    return 0


if __name__ == "__main__":
    load_dotenv()
    main()

Related Articles

Wikiwand AI