User:AndreJustAndre/JE script
From Wikipedia, the free encyclopedia
#!/usr/bin/env ruby
require 'excon'
require 'nokogiri'
require 'uri'
# Configuration
# All three values are frozen so they cannot be mutated by accident at runtime.

# Scheme and host that every request path is appended to.
BASE_URL = 'https://en.wikipedia.org'.freeze
# Path of the index page whose links to topic subpages are crawled.
INDEX_PATH = '/wiki/Wikipedia:Jewish_Encyclopedia_topics'.freeze
# Value sent as the User-Agent header on every request.
# NOTE(review): the mailto address looks like a placeholder — replace it with a
# real contact address before running the script against the live site.
USER_AGENT = 'JewishEncyclopediaStats/1.0 (mailto:your_email@example.com) Ruby script'.freeze
# Downloads one page from BASE_URL and parses it as HTML.
#
# @param path [String] site-relative path (query string allowed), appended to BASE_URL
# @param follow_redirects [Boolean] when true, Excon's RedirectFollower middleware
#   is added so HTTP redirects are followed before the status is inspected
# @return [Object, nil] the document returned by Nokogiri::HTML for a 200 response;
#   nil (after a warning on stderr) for a 3xx response, any other non-200 status,
#   or a transport-level failure raised by Excon
def fetch_page(path, follow_redirects: true)
  url = "#{BASE_URL}#{path}"

  # Build a fresh array so the library-wide default middleware list is never mutated.
  middlewares = Excon.defaults[:middlewares]
  middlewares += [Excon::Middleware::RedirectFollower] if follow_redirects

  response = Excon.get(
    url,
    headers: { 'User-Agent' => USER_AGENT },
    middlewares: middlewares
  )

  case response.status
  when 200
    Nokogiri::HTML(response.body)
  when 300...400
    # Only reachable when the redirect was not followed.
    warn "Skipping #{path}: Redirect detected (Status #{response.status})"
    nil
  else
    warn "Error fetching #{url}: Status #{response.status}"
    nil
  end
rescue Excon::Error => e
  # A socket error or timeout on one page should not abort the whole crawl;
  # report it and let the caller treat the page as unavailable.
  warn "Error fetching #{url}: #{e.class}: #{e.message}"
  nil
end
# Renders a link node as wikitext: "[[Title]]" or "[[Title|label]]".
#
# @param node [#text, #[], nil] link element; its stripped text is the label and
#   its 'title' attribute (when present) is the link target
# @return [String] the wikilink, or "" when no node is given
def format_wikilink(node)
  return '' unless node

  label = node.text.strip
  raw_title = node['title']

  # The title may carry a " (page does not exist)" marker (see the red-link case
  # this script handles); strip it. Without a title the label doubles as target.
  target = raw_title ? raw_title.sub(' (page does not exist)', '') : label

  target == label ? "[[#{target}]]" : "[[#{target}|#{label}]]"
end
# Tallies existing ("blue") and missing ("red") topic links on one subpage.
#
# @param path [String] site-relative path of the subpage
# @return [Hash, nil] a hash with :path, :range, :blue, :red, :total,
#   :blue_percent and :red_percent; nil when the page could not be fetched,
#   is a redirect, or contains no topic links
def analyze_subpage(path)
  # ?redirect=no asks for the redirect page itself, so a redirect can be
  # recognised from its markup below.
  doc = fetch_page("#{path}?redirect=no")
  return nil unless doc

  # Redirect pages are recognised by either of these marker classes.
  if %w[.redirectText .redirectMsg].any? { |selector| doc.at_css(selector) }
    warn "Skipping #{path}: Redirect detected via page content."
    return nil
  end

  non_article_prefixes = ['/wiki/File:', '/wiki/Category:', '/wiki/Special:', '/wiki/Help:', '/wiki/Wikipedia:']

  # A red link is an anchor carrying the "new" CSS class.
  red_link = ->(anchor) { anchor['class']&.split&.include?('new') }

  # A topic link is either a /wiki/ link outside the excluded namespaces or a red link.
  topic_link = lambda do |anchor|
    target = anchor['href']
    next false unless target

    article = target.start_with?('/wiki/') && !target.start_with?(*non_article_prefixes)
    article || red_link.call(anchor)
  end

  # One topic per list item: keep the first qualifying link of each item and
  # drop items that have none.
  # NOTE(review): 'ol li' also matches list items nested inside an item —
  # confirm that counting those separately is intended.
  list_items = doc.css('div.mw-parser-output ol li')
  candidates = list_items.map { |item| item.css('a').find { |anchor| topic_link.call(anchor) } }
  topic_nodes = candidates.compact

  total = topic_nodes.size
  return nil if total.zero?

  red_links = topic_nodes.count { |anchor| red_link.call(anchor) }
  blue_links = total - red_links
  share = ->(count) { (count.to_f / total * 100).round(1) }

  {
    path: path,
    range: "#{format_wikilink(topic_nodes.first)} - #{format_wikilink(topic_nodes.last)}",
    blue: blue_links,
    red: red_links,
    total: total,
    blue_percent: share.call(blue_links),
    red_percent: share.call(red_links)
  }
end
# Builds a sort key that orders paths "naturally" (A2 before A10).
#
# The path is split into runs of digits and runs of letters. Every run becomes
# a [rank, value] pair — rank 0 with an Integer for digits, rank 1 with the
# String for letters — so a number is never compared directly with a word.
# (Integer <=> String is nil, which makes sort_by raise ArgumentError.)
def natural_sort_key(path)
  path.scan(/\d+|[a-zA-Z]+/).map do |token|
    token =~ /\A\d/ ? [0, token.to_i] : [1, token]
  end
end

# Returns the unique, naturally sorted hrefs of topic subpages linked from the
# index document. Anchors without an href are ignored.
def subpage_paths(index_doc)
  pattern = %r{\A/wiki/Wikipedia:Jewish_Encyclopedia_topics/[A-Z0-9]+\z}
  hrefs = index_doc.css('a').map { |anchor| anchor['href'] }
  hrefs.compact.grep(pattern).uniq.sort_by { |href| natural_sort_key(href) }
end

# Crawls the index page and every topic subpage, printing a wikitext table of
# blue/red link counts per subpage plus a grand total to stdout. Progress and
# problems go to stderr. Aborts the process if the index page cannot be fetched.
def run
  warn "Fetching index: #{INDEX_PATH}..."
  # Allow redirects for the main index page just in case
  index_doc = fetch_page(INDEX_PATH, follow_redirects: true)
  abort "Failed to retrieve index page." unless index_doc

  subpage_links = subpage_paths(index_doc)
  warn "Found #{subpage_links.size} subpages. Starting analysis..."

  # Wikitext table header.
  # NOTE(review): the third column is headed "Initial" but is filled with the
  # per-page link total — confirm the intended label.
  puts '{| class="wikitable sortable"'
  puts '|-'
  puts "! Page !! From - to !! Initial !!colspan='2' | Blue links !!colspan='2' | Red links"

  total_blue = 0
  total_red = 0
  subpage_links.each do |subpath|
    stats = analyze_subpage(subpath)
    if stats
      page_name = subpath.split('/').last
      b_pct = "#{stats[:blue_percent]}%"
      r_pct = "#{stats[:red_percent]}%"
      puts '|-'
      puts "| [[Wikipedia:Jewish Encyclopedia topics/#{page_name}|#{page_name}]] || #{stats[:range]} || #{stats[:total]} || #{stats[:blue]} || #{b_pct} || #{stats[:red]} || #{r_pct}"
      total_blue += stats[:blue]
      total_red += stats[:red]
    end
    # Be polite to Wikipedia servers: pause after every subpage, analysed or skipped.
    sleep 0.5
  end

  grand_total = total_blue + total_red
  if grand_total > 0
    grand_blue_pct = (total_blue.to_f / grand_total * 100).round(1)
    grand_red_pct = (total_red.to_f / grand_total * 100).round(1)
    puts '|-'
    puts "| || '''Total''' || '''#{grand_total}''' || '''#{total_blue}''' || #{grand_blue_pct}% || '''#{total_red}''' || #{grand_red_pct}%"
    puts '|}'
    puts "Total JE % of blue: #{grand_blue_pct}%"
  else
    # Nothing was counted: just close the table.
    puts '|}'
  end
end
# Script entry point: fetch the index and print the statistics table.
run