User:Somepinkdude/PyWikiBot.py

From Wikipedia, the free encyclopedia

###########################################################
#This is a PyWikiBot script for a bot I plan to request.  #
#It fixes undefined references by looking through the     #
#page's history for a definition.                         #
###########################################################

import pywikibot
import re
import urllib.request as web

# Wikimedia asks automated clients to identify themselves; the default
# urllib User-Agent is commonly rejected.
_USER_AGENT = "PyWikiBot-undefined-ref-fixer/0.1 (User:Somepinkdude)"

# Message MediaWiki renders for a named reference that is used but never
# defined.  The capture group is the reference name.
_CITE_ERROR_RE = re.compile(
    r"Cite error: The named reference <code>([^<>]+)</code>"
    r" was invoked but never defined"
)


def _fetch_rendered_html(pagename):
    """Download the rendered HTML of *pagename* from en.wikipedia as text."""
    from urllib.parse import quote

    url = 'https://en.wikipedia.org/wiki/' + quote(pagename.replace(" ", "_"))
    request = web.Request(url, headers={"User-Agent": _USER_AGENT})
    # urlopen returns a response object, not a string: read and decode it,
    # and close the connection when done.
    with web.urlopen(request) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset, errors="replace")


def _undefined_ref_names(rendered):
    """Return the distinct undefined reference names reported in *rendered*.

    Names are HTML-unescaped so they can be matched against wikitext, and
    duplicates are dropped while keeping first-seen order.
    """
    from html import unescape

    names = (unescape(name) for name in _CITE_ERROR_RE.findall(rendered))
    return list(dict.fromkeys(names))


def _collect_definitions(page, refnames):
    """Map each name in *refnames* to its full definitions found in history.

    The revision history of *page* is walked once; every non-self-closing
    ``<ref ... name="NAME" ...>text</ref>`` found in any revision is recorded
    under NAME.  Only double-quoted names are recognised.
    """
    patterns = {
        name: re.compile(
            rf'< ?ref[^<>/]+name ?= ?"{re.escape(name)}"[^></]*>[^<>]+< ?/ ?ref ?>'
        )
        for name in refnames
    }
    found = {name: [] for name in refnames}
    for rev in page.revisions(content=True):
        # NOTE(review): revisions with hidden content presumably carry no
        # text; treat that as an empty revision rather than crashing.
        text = rev.text or ""
        for name, pattern in patterns.items():
            found[name].extend(pattern.findall(text))
    return found


def main(pagename):
    """Repair undefined named references on the en.wikipedia page *pagename*.

    The rendered page is scanned for "Cite error: The named reference ...
    was invoked but never defined" messages.  For each distinct name
    reported, the revision history is searched for full
    ``<ref name="...">...</ref>`` definitions.  If ``compare_title`` accepts
    the candidates, the longest one (or ``{{unreferenced}}`` when there is
    none) replaces the first self-closing ``<ref name="..." />`` tag in the
    current wikitext, and the page is saved unless it opts out of bot edits.

    Args:
        pagename: Title of the page to repair, e.g. ``'Mr. Jaggers'``.

    Returns:
        None.  The page text, and the new text after each save, are printed.

    Raises:
        urllib.error.URLError: if the rendered page cannot be downloaded.
    """
    site = pywikibot.Site('en', 'wikipedia')
    page = pywikibot.Page(site, pagename)

    error_names = _undefined_ref_names(_fetch_rendered_html(pagename))
    if not error_names:
        # Nothing to fix: avoid downloading the whole revision history.
        return

    # One pass over the history serves every reference name.
    definitions = _collect_definitions(page, error_names)

    for refname in error_names:
        print(page)

        txt = page.text
        print(txt)

        references = definitions[refname]

        # NOTE(review): when no usable definition exists the tag is replaced
        # by {{unreferenced}}, as in the original script -- confirm that this
        # is the desired fallback.
        finalref = "{{unreferenced}}"
        if compare_title(references):
            # Longest candidate wins; ties keep the first one encountered.
            finalref = max(references, key=len, default=finalref)

        # Replace the first self-closing reference with the definition.  The
        # replacement is given as a function so backslashes in the citation
        # are inserted literally instead of being read as regex escapes.
        ref_tag = rf'< ?ref[^<>/]+name ?= ?"{re.escape(refname)}"[^></]*/ ?>'
        newtxt = re.sub(ref_tag, lambda _match: finalref, txt, count=1)

        if newtxt == txt:
            # No self-closing tag matched; do not save an empty edit.
            continue

        # Save as long as the page does not exclude bots
        if re.search(r"{{ ?nobots ?}}|{{ ?bots[^}]+deny ?= ?all", txt):
            continue
        page.text = newtxt
        page.save("Bot edit: fixing undefined references")
        print(newtxt)

def compare_title(reflist):
    """Report whether every reference in *reflist* carries the same title(s).

    For each reference string, all ``title="..."`` properties (double-quoted
    only) are extracted and normalised by removing double quotes, single
    quotes and spaces.  Two references agree when their normalised title
    sequences are equal; a reference with no such property has an empty
    sequence, so it only agrees with other title-less references.

    Args:
        reflist: Iterable of reference strings (``<ref ...>...</ref>`` text).

    Returns:
        True if all references agree (including when *reflist* is empty or
        has a single entry), False otherwise.
    """
    title_keys = set()
    for ref in reflist:
        # Ideally there is only one title property, inside the {{cite}}
        # template, but every match is kept so extra ones are compared too.
        raw_titles = re.findall(r'title ?= ?"([^"><]+)"', ref)
        title_keys.add(
            tuple(re.sub(r"[\"' ]", "", title) for title in raw_titles)
        )
    # At most one distinct key means no two references disagree.
    return len(title_keys) <= 1

if __name__ == "__main__":
    # Script entry point: run the fixer against one hard-coded article.
    main(pagename='Mr. Jaggers')

Related Articles

Wikiwand AI