#!/usr/bin/ruby -w
## wiki_dump.rb : use MediaWiki API calls to list the pages of the wiki
## configured below, then pull the title and contents of each page so that
## each can be written to a <title>.wiki file and imported into Fossil.
## Uses code and pattern from bmsft: Everyday Scripting with Ruby,
## Brian Marick, Pragmatic Programmers

require 'open-uri'

## script config: target wiki
@wiki_url = "http://wiki.adric.net/"

## The XML entities decoded by reHTMLise, mapped to their literal characters.
XML_ENTITIES = {
  "&quot;" => '"',
  "&lt;"   => "<",
  "&gt;"   => ">",
  "&amp;"  => "&"
}.freeze

### Fetch the "allpages" list from the API and return the page titles.
### Uses a query like this:
### http://wiki.adric.net/api.php?action=query&list=allpages&aplimit=100&format=xml
###
### wiki_api_url:: full URL of the wiki's api.php
### Returns an Array of title Strings with spaces replaced by underscores.
### Raises ArgumentError (from restrict) when the response has no
### <allpages> ... </allpages> section.
###
### NOTE: only a single request is made, so at most aplimit (100) titles
### are returned; API continuation is not followed.
### NOTE(review): format=xml is requested explicitly so that the tag patterns
### below see raw XML; confirm the target wiki's API honours it.
def get_all_page_titles(wiki_api_url)
  url = wiki_api_url + "?action=query&list=allpages&aplimit=100&format=xml"
  ## URI.open rather than Kernel#open: open-uri no longer hooks Kernel#open
  ## on Ruby 3.0 and later.
  text = URI.open(url) { |page| page.read }
  allpages = restrict(text, %r{<allpages>}, %r{</allpages>})
  ## Each page is an element like <p pageid="1" ns="0" title="Main Page" />;
  ## only the title attribute is needed. Capturing inside the quotes works
  ## whether the elements are on one line or many.
  allpages.scan(%r{<p\s[^>]*?\btitle="([^"]*)"}).map do |(title)|
    title.gsub(" ", "_")
  end
end

### Get just the wikitext of one page via the export API, converted by
### reHTMLise. Uses a query like this:
### http://wiki.adric.net/api.php?action=query&export&rvprop=content&titles=2Nov2006&format=txt
###
### wiki_api_url::   full URL of the wiki's api.php
### wikipage_title:: page title as returned by get_all_page_titles
### Returns the converted page text as a String.
### Raises ArgumentError (from restrict_inside) when the export contains no
### <text ...> ... </text> element, e.g. for a missing or empty page.
###
### NOTE(review): the <text> markers are an assumption about the export
### format (MediaWiki export XML wraps revision text in <text>); the original
### patterns were not recoverable. Confirm against a real response.
def get_page_contents(wiki_api_url, wikipage_title)
  ## Escape the title so characters such as & # ' cannot corrupt the query.
  escaped_title = URI.encode_www_form_component(wikipage_title)
  url = wiki_api_url +
        "?action=query&export&rvprop=content&titles=#{escaped_title}&format=txt"
  text = URI.open(url) { |page| page.read }
  ## The opening tag may carry attributes; a self-closing <text ... /> (empty
  ## page) is deliberately not accepted as an opening marker.
  content = restrict_inside(text, %r{<text\b(?:[^>]*[^/>])?>}, %r{</text>})
  reHTMLise(content)
end

## Restrict text to the span starting at the first match of starting_regexp
## and ending at the FIRST CHARACTER of the next match of stopping_regexp
## (inclusive range, as in the original), as for snipping html by tag.
## Markers may be Regexps or plain Strings.
## Raises ArgumentError when either marker is not found.
## from bmsft Scripting for Testers, p144, in affinity-trip.rb
def restrict(html, starting_regexp, stopping_regexp)
  start = html.index(starting_regexp)
  raise ArgumentError, "start marker #{starting_regexp.inspect} not found" unless start

  stop = html.index(stopping_regexp, start)
  raise ArgumentError, "stop marker #{stopping_regexp.inspect} not found" unless stop

  html[start..stop]
end

## Restrict text to that strictly between two markers, excluding the markers
## themselves. Markers may be Regexps or plain Strings.
## Raises ArgumentError when either marker is not found.
## from bmsft Scripting for Testers, p144, in affinity-trip.rb
## The original computed the offset as starting_regexp.to_s.size - 8; the 8
## is the "(?-mix:" + ")" wrapper that Regexp#to_s adds. Using the end of the
## actual match is correct for any pattern, not only literal ones.
def restrict_inside(html, starting_regexp, stopping_regexp)
  ## Regexp.union escapes a String marker and passes a Regexp through.
  opening = html.match(Regexp.union(starting_regexp))
  raise ArgumentError, "start marker #{starting_regexp.inspect} not found" unless opening

  start = opening.end(0)
  stop = html.index(stopping_regexp, start)
  raise ArgumentError, "stop marker #{stopping_regexp.inspect} not found" unless stop

  html[start...stop]
end

## Turn MediaWiki specific markup back into mostly HTML (3) for portability.
## Fossil-SCM only knows a couple bits of wiki syntax, so adjust for those
## (bullet list, single-bracket links) and use HTML for the rest.
##
## ugly_string:: MediaWiki wikitext, possibly with XML-escaped characters
## Returns a new String; the argument is not modified.
def reHTMLise(ugly_string)
  text = ugly_string.dup

  ## Quote markup, longest run first so ''' is not consumed by the '' rule.
  ## Captures are lazy so two spans on one line stay separate; \s*? still
  ## lets a span cross a line break as the original patterns did.
  text = text.gsub(%r{'''''(.*?\s*?.*?)'''''}, '<b><i>\1</i></b>')
  text = text.gsub(%r{'''(.*?\s*?.*?)'''}, '<b>\1</b>')
  text = text.gsub(%r{''(.*?\s*?.*?)''}, '<i>\1</i>')

  ## mediawiki headings -> hN /hN. Anchored to a whole line so stray "="
  ## characters (URLs, HTML attributes) are left alone. Levels 2 and deeper
  ## keep the leading line break kludge; level 1 does not.
  text = text.gsub(/^(={1,6})(.+?)\1[ \t\r]*$/) do
    level = Regexp.last_match(1).size
    heading = "<h#{level}>#{Regexp.last_match(2)}</h#{level}>"
    level == 1 ? heading : "\n" + heading
  end

  ## &quot; &lt; &gt; &amp; back to literal characters, in one pass so that
  ## e.g. "&amp;lt;" becomes "&lt;" rather than being decoded twice.
  text = text.gsub(/&(?:quot|lt|gt|amp);/, XML_ENTITIES)

  ## wiki links [[ -> [ and ]] -> ]
  text = text.gsub("[[", "[").gsub("]]", "]")

  ## mediawiki * list to Fossil * list .. double the spaces: the "*" gets two
  ## spaces on each side. [ \t] instead of \s keeps the match on one line.
  ## (An alternative that wrapped each item in ul/li tags was left disabled
  ## in the original script.)
  text.gsub(/^[ \t]*\*[ \t]+(.+)$/, '  *  \1')
end

## Only dump the wiki when run as a script, so the helpers above can be
## loaded without touching the network.
if __FILE__ == $PROGRAM_NAME
  @wiki_api_url = @wiki_url + "api.php"
  @all_page_titles = get_all_page_titles @wiki_api_url
  p "Debug: #{@all_page_titles.size} page titles found, first #{@all_page_titles.first}, last #{@all_page_titles.last}"

  page_removals = [
    ## obnoxious pages were moved but the name is still choking the script:
    "Anda&#039;s_Game", "Rae&#039;s_Sigils",
    ## spam
    "W",
    ## whereas these simply don't get to come along for the ride:
    "AJ_Sabo", "First_Look_at_the_House", "First_Weekend_of_Summer",
    "Ghost_Chasers", "Initial_Exploration", "Last_Day_of_School",
    "MW_Main_Page", "Main_Page", "Monica_Sabo", "Quitting_Time",
    "Rae_Sigils", "Service_Packets", "Sigils", "Spook_Nation",
    "Summer_Plans", "Summer_Vacation", "Rough_Maps"
  ]
  ## so we don't want to copy them down
  @all_page_titles -= page_removals

  ## debug, only first and last and Fudooshin for now
  ##@all_page_titles = [@all_page_titles.first, @all_page_titles.last, "Fudooshin"]

  @all_page_titles.each do |page_title|
    wiki_file = "#{page_title}.wiki"
    ## Fetch first so a failed download does not leave an empty file behind.
    page_contents = get_page_contents(@wiki_api_url, page_title)
    ## p "Debug:" + page_contents

    ## Write page contents to page_title.wiki; the block form closes the file
    ## even on error and << hands back the File for the debug line.
    out = File.open(wiki_file, 'w') { |file| file << page_contents }
    p "Debug: wrote #{out.inspect}, now for wiki create.."

    ## Argument-array form bypasses the shell, so titles containing shell
    ## metacharacters are passed to fossil verbatim. Output is captured and
    ## discarded, as the original backticks did.
    IO.popen(["fossil", "wiki", "create", page_title, wiki_file], &:read)
    p " % "
  end
end