#!/usr/bin/ruby -w
## wiki_dump.rb : use MediaWiki API calls to list all pages in the configured
## wiki, then pull the title and contents of each page so that each one
## can be written to its own .wiki file.
## Uses code and patterns from bmsft: Everyday Scripting with Ruby, Brian Marick, Pragmatic Programmers
require 'open-uri'
## script config: target wiki
## Base URL of the MediaWiki site to dump. Keep the trailing slash: the API
## endpoint is built further down by appending "api.php" directly to this value.
@wiki_url = "http://wiki.adric.net/"
### Fetch one batch of page titles from the MediaWiki API.
### Issues a query like:
###   http://wiki.adric.net/api.php?action=query&list=allpages&aplimit=100&format=xml
### wiki_api_url:: full URL of the wiki's api.php endpoint
### Returns an Array of title Strings with spaces turned into underscores
### (e.g. "Main_Page"); returns [] when the reply has no <allpages> section.
### NOTE: only a single batch (aplimit=100) is requested and API continuation
### is not followed, so a bigger wiki yields just its first batch of titles.
def get_all_page_titles(wiki_api_url)
  # Ask for raw XML explicitly: the tag patterns below only match unescaped
  # XML, and the API's default output format is not guaranteed to be that.
  url = "#{wiki_api_url}?action=query&list=allpages&aplimit=100&format=xml"
  # URI.open is the open-uri entry point (Kernel#open stopped accepting URLs
  # in Ruby 3.0); the block form closes the response once it has been read.
  text = URI.open(url, &:read)
  return [] unless text.include?('<allpages>')

  allpages = restrict(text, %r{<allpages>}, %r{</allpages>})
  ## entries look like <p pageid="1" ns="0" title="Main Page" /> ; only the
  ## title attribute is needed. The match is non-greedy and stops at the
  ## closing quote, so several <p/> elements on one line are kept apart.
  xml_entities = {
    '&quot;' => '"', '&apos;' => "'", '&#39;' => "'", '&#039;' => "'",
    '&lt;' => '<', '&gt;' => '>', '&amp;' => '&'
  }
  entity_pattern = Regexp.union(xml_entities.keys)
  allpages.scan(/<p\s[^>]*?\btitle="([^"]*)"/).flatten.map do |raw_title|
    ## undo XML attribute escaping, then spaces become underscores for sanity
    raw_title.gsub(entity_pattern, xml_entities).tr(' ', '_')
  end
end
### Get just the contents of a wiki page export, converted by reHTMLise.
## http://wiki.adric.net/api.php?action=query&export&rvprop=content&titles=2Nov2006&format=txt
## wiki_api_url::   full URL of the wiki's api.php endpoint
## wikipage_title:: page title as returned by get_all_page_titles
## Returns the page body as a String ("" for a page whose <text/> is empty).
## Raises RuntimeError when the reply contains no <text> element at all.
def get_page_contents(wiki_api_url, wikipage_title)
  # Escape the title so apostrophes, ampersands and the like cannot break
  # the query string.
  escaped_title = URI.encode_www_form_component(wikipage_title)
  wikipage_export_url = "#{wiki_api_url}?action=query&export&rvprop=content&titles=#{escaped_title}&format=txt"
  # URI.open is the open-uri entry point (Kernel#open stopped accepting URLs
  # in Ruby 3.0); the block form closes the response once it has been read.
  text = URI.open(wikipage_export_url, &:read)
  # NOTE(review): assumes the export payload carries the page body in a
  # <text ...>...</text> element, as MediaWiki's XML export does - confirm
  # against the target wiki. Attributes on the tag are tolerated, and a
  # self-closing <text ... /> counts as an empty page.
  found = text.match(%r{<text\b[^>]*?(?:/>|>(.*?)</text>)}m)
  raise "no <text> element found in export of #{wikipage_title.inspect}" unless found

  reHTMLise(found[1].to_s)
end
## Snip a region out of +html+, as for cutting html down to one tag.
## The region begins at the first match of +starting_regexp+ and runs up to
## and including the FIRST character of the next +stopping_regexp+ match
## (the stop marker is searched for from the start position onward).
## from bmsft Scripting for Testers, p144, in affinity-trip.rb
def restrict(html, starting_regexp, stopping_regexp)
  from = html.index(starting_regexp)
  upto = html.index(stopping_regexp, from)
  html.slice(from..upto)
end
## Restrict text to that strictly between two markers, as for snipping the
## inside of an html tag. Adapted from restrict in bmsft Scripting for
## Testers, p144, in affinity-trip.rb, but excluding both markers.
## html::            String to search
## starting_regexp:: Regexp (or literal String) marking where the region opens
## stopping_regexp:: Regexp (or literal String) marking where the region ends
## Returns the text after the end of the start match and before the next stop
## match; when no stop marker follows, everything up to the end of +html+.
## Raises ArgumentError when the start marker is not found.
def restrict_inside(html, starting_regexp, stopping_regexp)
  # Use the real end offset of the match. The earlier arithmetic
  # (starting_regexp.to_s.size - 8) only cancelled the "(?-mix:" ... ")"
  # wrapper added by Regexp#to_s, i.e. it measured the pattern's source
  # text, which is wrong whenever the pattern is not a plain literal.
  # Regexp.union returns a Regexp unchanged and escapes a plain String.
  opening = Regexp.union(starting_regexp).match(html)
  raise ArgumentError, "start marker #{starting_regexp.inspect} not found" unless opening

  start = opening.end(0)
  stop = html.index(stopping_regexp, start)
  stop ? html[start...stop] : html[start..-1]
end
## Turn Mediawiki specific markup back into mostly HTML (3) for portability.
## Fossil-SCM only knows a couple bits of wiki-syntax, so those (bullet
## lists, single-bracket links) are adjusted and HTML is used for the rest.
## ugly_string:: String of MediaWiki markup with &quot; &lt; &gt; &amp; escaped
## Returns a new String; the argument itself is left untouched.
def reHTMLise(ugly_string)
  entities = { '&quot;' => '"', '&lt;' => '<', '&gt;' => '>', '&amp;' => '&' }

  ## Emphasis, longest quote run first so that ''' is not eaten by the ''
  ## rule: ''''' -> b+i, ''' -> b /b, '' -> i /i. Non-greedy, so two
  ## emphasised spans on one line stay separate.
  text = ugly_string.gsub(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
  text = text.gsub(/'''(.+?)'''/, '<b>\1</b>')
  text = text.gsub(/''(.+?)''/, '<i>\1</i>')

  ## mediawiki headings -> h1 /h1 .. h6 /h6. Only a whole line that starts
  ## and ends with the same run of = signs counts, so "a=b&c=d" in running
  ## text is left alone. Levels below h1 keep the line break kludge (a
  ## newline in front of the tag).
  text = text.gsub(/^(={1,6})[ \t]*(.+?)[ \t]*\1[ \t]*$/) do
    level = Regexp.last_match(1).size
    heading = "<h#{level}>#{Regexp.last_match(2)}</h#{level}>"
    level > 1 ? "\n#{heading}" : heading
  end

  ## &quot; &lt; &gt; &amp; back to " < > &, thanks. Done in one pass so
  ## that "&amp;lt;" ends up as "&lt;" and is not unescaped twice.
  text = text.gsub(/&(?:quot|lt|gt|amp);/, entities)

  ## wiki links [[ -> [ and ]] -> ]
  text = text.gsub('[[', '[').gsub(']]', ']')

  ## mediawiki * list to Fossil * list: Fossil wants the asterisk wrapped in
  ## doubled spaces. ^ and $ match per line, so no split/join is needed and a
  ## trailing newline survives.
  text.gsub(/^[ \t]*\*[ \t]+(\S.*)$/) { "  *  #{Regexp.last_match(1)}" }
end
## ---- main script ----
## List the wiki's pages, drop the ones that should not be migrated, then for
## every remaining title write <title>.wiki and import it with
## `fossil wiki create`.
@wiki_api_url = @wiki_url + "api.php"
@all_page_titles = get_all_page_titles @wiki_api_url
p "Debug: #{@all_page_titles.size} page titles found, first #{@all_page_titles.first}, last #{@all_page_titles.last}"
## obnoxious pages were moved but the name is still choking the script:
@all_page_titles.delete "Anda's_Game"
@all_page_titles.delete "Rae's_Sigils"
## spam
@all_page_titles.delete "W"
## whereas these simply don't get to come along for the ride:
page_removals = [ "AJ_Sabo", "First_Look_at_the_House", "First_Weekend_of_Summer",
  "Ghost_Chasers","Initial_Exploration", "Last_Day_of_School", "MW_Main_Page", "Main_Page",
  "Monica_Sabo", "Quitting_Time", "Rae_Sigils", "Service_Packets", "Sigils", "Spook_Nation",
  "Summer_Plans", "Summer_Vacation", "Rough_Maps" ]
## so we don't want to copy them down
page_removals.each do |page|
  @all_page_titles.delete page
end
## debug, only first and last and Fudooshin for now
##@all_page_titles = [@all_page_titles.first, @all_page_titles.last, "Fudooshin"]
@all_page_titles.each do |page_title|
  file_name = "#{page_title}.wiki"
  ## fetch first, so a failed download does not leave an empty file behind
  page_contents = get_page_contents(@wiki_api_url, page_title)
  ## p "Debug:" + page_contents
  ## write page contents to page_title.wiki; the block form closes the file
  ## even if the write raises
  File.open(file_name, 'w') do |out|
    @out = out # kept so the debug line below still reports the File object
    out << page_contents
  end
  p "Debug: wrote #{@out.inspect}, now for wiki create.."
  ## Argument-array form: no shell is involved, so quotes, spaces or other
  ## shell metacharacters in a title cannot break or hijack the command.
  ## As with the backticks used before, fossil's stdout is captured and dropped.
  IO.popen(['fossil', 'wiki', 'create', page_title, file_name], &:read)
  p " % "
end