# User:Somepinkdude/PyWikiBot.py
# From Wikipedia, the free encyclopedia
###########################################################
# This is a Pywikibot script for a bot I plan to request.
# It fixes undefined references by looking through the
# page's history for a definition.
###########################################################
import pywikibot
import re
import urllib.request as web
def _fetch_rendered_html(pagename):
    """Download the rendered HTML of an English Wikipedia page as text."""
    from urllib.parse import quote

    # Percent-encode the title so characters such as "?" or non-ASCII
    # letters cannot corrupt the URL; "/" is left alone for subpages.
    url = "https://en.wikipedia.org/wiki/" + quote(pagename.replace(" ", "_"))
    # Wikimedia's User-Agent policy asks clients to identify themselves and
    # may refuse the generic "Python-urllib" agent.
    # NOTE(review): the operator should add contact details to this string.
    request = web.Request(
        url,
        headers={"User-Agent": "UndefinedRefFixer/0.1 (Pywikibot helper script)"},
    )
    with web.urlopen(request) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        # urlopen yields bytes; the regex search below needs str.
        return response.read().decode(charset, errors="replace")


def _undefined_ref_names(rendered_html):
    """Return the distinct reference names flagged as undefined, in page order."""
    from html import unescape

    # The capture group isolates the name directly instead of stripping the
    # surrounding message text with str.replace afterwards.
    found = re.findall(
        "Cite error: The named reference <code>([^<>]+)</code> "
        "was invoked but never defined",
        rendered_html,
    )
    # Names arrive HTML-escaped; unescape them so they match the wikitext,
    # and drop duplicates while keeping first-seen order.
    return list(dict.fromkeys(unescape(name) for name in found))


def _collect_definitions(page, refnames):
    """Walk the revision history once and gather full definitions per name.

    Returns a dict mapping each name in *refnames* to the list of
    non-self-closing ``<ref name="...">...</ref>`` strings found in the
    page's revisions, in the order the revisions are yielded.
    """
    # re.escape keeps regex metacharacters in a name (".", "(", "+", ...)
    # literal.  "[^></]*" (rather than "+") also accepts the common form
    # <ref name="x"> with nothing between the closing quote and ">".
    # Limitation kept from the original: a definition whose content contains
    # "<" or ">" (nested tags, HTML comments) is not matched.
    patterns = {
        name: re.compile(
            rf'< ?ref[^<>/]+name ?= ?"{re.escape(name)}"[^></]*>[^<>]+< ?/ ?ref ?>'
        )
        for name in refnames
    }
    definitions = {name: [] for name in refnames}
    for rev in page.revisions(content=True):
        # The text may be missing (e.g. for a hidden revision); skip those.
        text = getattr(rev, "text", None)
        if not text:
            continue
        for name, pattern in patterns.items():
            definitions[name].extend(pattern.findall(text))
    return definitions


def main(pagename):
    """Fix undefined named references on the English Wikipedia page *pagename*.

    The rendered page is scanned for "Cite error" messages to learn which
    reference names are invoked but never defined.  For each such name the
    revision history is searched for full definitions; if compare_title()
    accepts them, the longest one replaces the first self-closing
    ``<ref name="..." />`` tag in the current text, otherwise the
    ``{{unreferenced}}`` fallback is inserted.  The page is saved unless it
    carries a ``{{nobots}}`` / ``{{bots|deny=all}}`` marker.

    Args:
        pagename: title of the page to repair, e.g. ``'Mr. Jaggers'``.
    """
    # Basic definitions valid for each reference fixed.
    site = pywikibot.Site('en', 'wikipedia')
    page = pywikibot.Page(site, pagename)

    refnames = _undefined_ref_names(_fetch_rendered_html(pagename))
    if not refnames:
        return

    # One pass over the history serves every name, instead of downloading
    # all revisions again for each reference.
    definitions = _collect_definitions(page, refnames)

    for refname in refnames:
        print(page)
        # Re-read the text each time so earlier fixes in this run are kept.
        txt = page.text
        print(txt)

        references = definitions[refname]
        # NOTE(review): {{unreferenced}} is the original fallback when no
        # trustworthy definition exists; confirm this is the desired marker.
        finalref = "{{unreferenced}}"
        if references and compare_title(references):
            # The longest definition is assumed to be the most complete one;
            # max() keeps the first candidate on ties.
            finalref = max(references, key=len)

        # Replace the first self-closing reference with the chosen text.
        ref_tag = re.compile(
            rf'< ?ref[^<>/]+name ?= ?"{re.escape(refname)}"[^></]*/ ?>'
        )
        # A callable replacement inserts finalref verbatim; a plain string
        # would have its backslashes interpreted as regex escapes.
        newtxt = ref_tag.sub(lambda _match: finalref, txt, count=1)

        # Save as long as the page does not exclude bots.
        if not re.search("{{ ?nobots ?}}|{{ ?bots[^}]+deny ?= ?all", txt):
            page.text = newtxt
            page.save("Bot edit: fixing undefined references")
        print(newtxt)
def compare_title(reflist):
    """Check whether all references in *reflist* agree on their title(s).

    Each reference is searched for ``title=`` values, either double-quoted
    (``title="Foo"``) or bare as written in a citation template
    (``|title=Foo|``).  Double quotes, apostrophes and spaces are removed
    before comparing, so differences in spacing or quoting are ignored.

    Args:
        reflist: iterable of reference strings (wikitext).

    Returns:
        True if every reference yields the same sequence of titles, which
        includes an empty *reflist* and references that all lack a title;
        False as soon as two references disagree.
    """
    # First alternative: the quoted form the original pattern looked for.
    # Second alternative: a bare template value, ending at "|", "}" or a
    # line break.
    title_pattern = re.compile(
        r'title[ \t]*=[ \t]*(?:"([^"><]+)"|([^"><|{}\n]+))'
    )
    seen = set()
    for ref in reflist:
        titles = []
        for quoted, bare in title_pattern.findall(ref):
            # Remove quotes and whitespace so formatting does not matter.
            cleaned = re.sub("[\"' ]", "", quoted or bare)
            if cleaned:
                titles.append(cleaned)
        seen.add(tuple(titles))
        # Two distinct title sequences already prove a mismatch.
        if len(seen) > 1:
            return False
    return True
# Script entry point: when executed directly (not imported), repair the
# references of the hard-coded sample article.
if __name__ == "__main__":
    main(pagename='Mr. Jaggers')