User:Mbear/UpdatesNeededProgram

Requirements

I don't remember exactly. I wrote these two programs several years ago. You need Python with the BeautifulSoup and requests modules. I've only run these via Python 2.7; they will not run unmodified under Python 3, because they use Python 2 print statements and the unicode() built-in.

Updates Needed Builder

The following Python 2.7 program accesses the Updates Needed page, gets a list of links, then examines each of those pages to find the content of the "Update Needed" tag. This information is saved as a table row in an HTML table. It does NOT include character (biography) pages, because I don't do those. (See next heading.)

import re
import requests
from bs4 import BeautifulSoup

# Static HTML scaffolding for the generated report.  The two span classes
# colour-code sources in the table: 'hardcopy' (blue) for print books and
# 'epub' (pink) for e-books; uncoloured sources matched neither list.
HTMLOpen = """<!DOCTYPE html>
<html>
<head>
    <title>Updates Needed List</title>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8"> 
    <style>
        body{font-family:"Arial";font-size:8pt;}
        table,tr,th,td{border:1px solid #000;border-collapse:collapse;padding:1px;}
        span.hardcopy {background-color:#5ddfff;}
        span.epub{background-color:#ffc0cb;}
    </style>
</head>
<body>
    <table>
        <thead>
            <tr>
                <th>Page</th>
                <th>Sources</th>
            </tr>
        </thead>
    """

# Each article becomes one table row: page title in the first cell,
# comma-separated source list in the second.
HTMLLineStart = "<tr><td>"
HTMLLineEnd = "</td></tr>"
    
HTMLClose="""    </table>
</body>
</html>"""


# Regex matching the rendered "Update Needed" template box on an article
# page.  Group 2 captures the HTML between <i>...</i> that lists the
# source title link(s).
# NOTE(review): brittle by design — any change to the template's wording
# or markup on the wiki breaks the match.
rawstr = """(<b>Update Needed</b><br /><div style="font-size: 90%; text-align: center">This article needs to be updated with material from <i>)(.+)(</i>. Once this title clears the <a href="/wiki/Policy:Moratorium" title="Policy:Moratorium">Moratorium period</a>, or if it already has, please consider revisiting this article and updating it with the new material, removing this tag once all information has been added.)"""

# page title -> list of (possibly span-wrapped) source titles
UpdateNeededDict = {}

# Lowercase source lists used to colour-code the report.  splitlines()
# drops the trailing newline from each entry; `with` guarantees the file
# handles are closed (the original leaked both).
with open('epublist-bare.txt', 'r') as f:
    epublist = f.read().splitlines()
with open('booklist.txt', 'r') as f:
    booklist = f.read().splitlines()


# Special page listing everything that transcludes Template:Update_Needed.
UpdateURL = 'http://www.sarna.net/wiki/index.php?title=Special:WhatLinksHere/Template:Update_Needed&limit=1500'
wikibase = "http://www.sarna.net"
# Identify the bot to the wiki operators, per good scraping etiquette.
headers = {
    'User-Agent':'Mbear\'s Update Needed Builder',
    'From':'pae@towerofjade.com'}

# Fetch the WhatLinksHere listing (up to 1500 entries) and extract each
# linked article anchor.  An explicit parser is passed so BeautifulSoup's
# behaviour doesn't depend on which third-party parsers happen to be
# installed (and to silence the no-parser warning in newer bs4).
r = requests.get(UpdateURL, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
linklist = soup.select("ul#mw-whatlinkshere-list > li > a[href]")

# The report is built incrementally into this file; closed at the bottom
# of the script.
UpdateList = open('UpdatesNeeded.html','w')
UpdateList.write(HTMLOpen)

# Count of character (biography) pages skipped; reported under the table.
characterCount = 0

try:
    for link in linklist:
         title=link.attrs['title']
         if ":" in title:
                print "examining {0}\tSKIPPING FILE - System file".format(title)
         else:
                print "examining {0}".format(title.encode('utf8',errors='ignore')),
                workinglink = wikibase + link.attrs['href']
                req=requests.get(workinglink)
                workingtext = req.text
                souptext=BeautifulSoup(workingtext)
                categorylist = souptext.select("div#mw-normal-catlinks > ul > li > a[title*=Characters]")
                if len(categorylist) == 0:
                    #print "no characters found"
                    compile_obj = re.compile(rawstr)
                    match_obj = compile_obj.search(workingtext)
                    newstr= BeautifulSoup(match_obj.group(2))
                    sl=[]
                    sourcelist = newstr.select("a[title]")
                    for source in sourcelist:
                        source = source.attrs['title'] #.encode('utf8')
                        #print source.attrs['title'] + ",",
                        if source.lower() in epublist:  #Need to add epublist and book list
                            source = "<span class='epub'>" + source + "</span>"
                            #print "Epub found: {0}".format(source)
                        elif source.lower() in booklist:
                            source = "<span class='hardcopy'>" + source + "</span>"
                            #print "Book found: {0}".format(source)
                        else:
                            source = source
                        #sl.append(source.attrs['title'].encode('utf8'))
                        sl.append(source.encode('utf8'))
                    UpdateNeededDict[title] = sl
                    print "\t({0} of {1})".format(len(UpdateNeededDict),len(linklist))
                else:
                    characterCount = characterCount + 1
                    print "\tSKIPPING FILE - Character: {0}".format(characterCount)
                
except: #KeyboardInterrupt:
    for k,v in sorted(UpdateNeededDict.items()):
        osl = ", ".join(v)
        writethis = HTMLLineStart + unicode(k).encode('utf8',errors='ignore') + '</td><td>' + osl +HTMLLineEnd+'\r'
        UpdateList.write(writethis)

# Emit one table row per collected article, sorted by page title, then
# append the character-page tally and the closing HTML.
for page, sources in sorted(UpdateNeededDict.items()):
    row = (HTMLLineStart
           + unicode(page).encode('utf8', errors='ignore')
           + '</td><td>'
           + ", ".join(sources)
           + HTMLLineEnd
           + '\r')
    UpdateList.write(row)

UpdateList.write("Character Count: {0}".format(characterCount))
UpdateList.write(HTMLClose)
UpdateList.close()

Updates Needed Builder People

This program builds a list of all characters/people who need to have some information added.

import re
import requests
from bs4 import BeautifulSoup

# Static HTML scaffolding for the character-pages report.  The two span
# classes colour-code sources in the table: 'hardcopy' (blue) for print
# books and 'epub' (pink) for e-books.
HTMLOpen = """<!DOCTYPE html>
<html>
<head>
    <title>Characters Updates Needed List</title>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8"> 
    <style>
        body{font-family:"Arial";font-size:8pt;}
        table,tr,th,td{border:1px solid #000;border-collapse:collapse;padding:1px;}
        span.hardcopy {background-color:#5ddfff;}
        span.epub{background-color:#ffc0cb;}
    </style>
</head>
<body>
    <table>
        <thead>
            <tr>
                <th>Page</th>
                <th>Sources</th>
            </tr>
        </thead>
    """

# Each character page becomes one table row: page title, then its sources.
HTMLLineStart = "<tr><td>"
HTMLLineEnd = "</td></tr>"
    
HTMLClose="""    </table>
</body>
</html>"""


# Regex matching the rendered "Update Needed" template box on an article
# page.  Group 2 captures the HTML between <i>...</i> that lists the
# source title link(s).
# NOTE(review): brittle by design — any change to the template's wording
# or markup on the wiki breaks the match.
rawstr = """(<b>Update Needed</b><br /><div style="font-size: 90%; text-align: center">This article needs to be updated with material from <i>)(.+)(</i>. Once this title clears the <a href="/wiki/Policy:Moratorium" title="Policy:Moratorium">Moratorium period</a>, or if it already has, please consider revisiting this article and updating it with the new material, removing this tag once all information has been added.)"""

# page title -> list of (possibly span-wrapped) source titles
UpdateNeededDict = {}

# Lowercase source lists used to colour-code the report.  splitlines()
# drops the trailing newline from each entry; `with` guarantees the file
# handles are closed (the original leaked both).
with open('epublist-bare.txt', 'r') as f:
    epublist = f.read().splitlines()
with open('booklist.txt', 'r') as f:
    booklist = f.read().splitlines()


# Special page listing everything that transcludes Template:Update_Needed.
UpdateURL = 'http://www.sarna.net/wiki/index.php?title=Special:WhatLinksHere/Template:Update_Needed&limit=1500'
wikibase = "http://www.sarna.net"
# Identify the bot to the wiki operators, per good scraping etiquette.
headers = {
    'User-Agent':'Mbear\'s Update Needed Builder',
    'From':'pae@towerofjade.com'}

# Fetch the WhatLinksHere listing (up to 1500 entries) and extract each
# linked article anchor.  An explicit parser is passed so BeautifulSoup's
# behaviour doesn't depend on which third-party parsers happen to be
# installed (and to silence the no-parser warning in newer bs4).
r = requests.get(UpdateURL, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
linklist = soup.select("ul#mw-whatlinkshere-list > li > a[href]")

# The report is built incrementally into this file; closed at the bottom
# of the script.
UpdateList = open('CharactersUpdateNeeded.html','w')
UpdateList.write(HTMLOpen)

# Count of non-character pages skipped (used only for progress display).
counter = 0

try:
    for link in linklist:
         title=link.attrs['title']
         if ":" in title:
                print "examining {0}\tSKIPPING FILE - System file".format(title)
         else:
                print "examining {0}".format(title.encode('utf8',errors='ignore')),
                workinglink = wikibase + link.attrs['href']
                req=requests.get(workinglink)
                workingtext = req.text
                souptext=BeautifulSoup(workingtext)
                categorylist = souptext.select("div#mw-normal-catlinks > ul > li > a[title*=Characters]")
                if len(categorylist) == 0:
                    counter = counter + 1
                    print "\t({0} of {1})".format(counter,len(linklist))
                else:
                    compile_obj = re.compile(rawstr)
                    match_obj = compile_obj.search(workingtext)
                    newstr= BeautifulSoup(match_obj.group(2))
                    sl=[]
                    sourcelist = newstr.select("a[title]")
                    for source in sourcelist:
                        source = source.attrs['title'] #.encode('utf8')
                        #print source.attrs['title'] + ",",
                        if source.lower() in epublist:  #Need to add epublist and book list
                            source = "<span class='epub'>" + source + "</span>"
                            #print "Epub found: {0}".format(source)
                        elif source.lower() in booklist:
                            source = "<span class='hardcopy'>" + source + "</span>"
                            #print "Book found: {0}".format(source)
                        else:
                            source = source
                        #sl.append(source.attrs['title'].encode('utf8'))
                        sl.append(source.encode('utf8'))
                    UpdateNeededDict[title] = sl
                    print "\t({0} of {1})".format(len(UpdateNeededDict),len(linklist))
                
except: #KeyboardInterrupt:
    for k,v in sorted(UpdateNeededDict.items()):
        osl = ", ".join(v)
        writethis = HTMLLineStart + unicode(k).encode('utf8',errors='ignore') + '</td><td>' + osl +HTMLLineEnd+'\r'
        UpdateList.write(writethis)

# Emit one table row per collected character page, sorted by page title,
# then append the closing HTML.
for page, sources in sorted(UpdateNeededDict.items()):
    row = (HTMLLineStart
           + unicode(page).encode('utf8', errors='ignore')
           + '</td><td>'
           + ", ".join(sources)
           + HTMLLineEnd
           + '\r')
    UpdateList.write(row)

UpdateList.write(HTMLClose)
UpdateList.close()