# manual/import.rb

require 'nokogiri'
require 'fileutils'
require 'open-uri'

# Remote page containing the exported manual; fetched once and cached locally
# as FILENAME.
URL = 'http://ardour.org/book/export/html/5848'.freeze
# Local copy of URL that is parsed on every run.
FILENAME = 'drupal-export.html'.freeze

# Feature switches for the import run.
# WRITE: write the generated page files below OUTPUT_DIR.
WRITE = true
# DOWNLOAD_FILES: fetch documents linked under /files/ (and apply FILES_MAPPINGS).
DOWNLOAD_FILES = false
# GET_ARDOUR_ORG_IMAGES: fetch images referenced with a root-relative src.
GET_ARDOUR_ORG_IMAGES = false
# HANDLE_OTHER_IMAGES: fetch images referenced with an absolute http URL.
HANDLE_OTHER_IMAGES = false

# Directory that receives the generated pages.
OUTPUT_DIR = '_manual'.freeze

# Directory that receives downloaded files and images.
FILES_DIR = 'source'.freeze

# Link slugs that must be replaced by another slug before they are resolved.
SLUG_MAPPINGS = {
    'working_with_sessions' => 'sessions',
    'export_stem' => 'export',
    'track_groups' => 'track_bus_groups',
    'vst_support' => 'windows_vst',
    'kbd_default' => 'default_bindings',
    'midistep_entry' => 'midi_step_entry',
    'midi_stepentry' => 'midi_step_entry'
}.freeze

# Link slugs that are never resolved; links to them are pointed at "/missing".
MISSING_SLUGS = %w(
    range_selection
    track_templates
    track_template
    color_dialog
    region_layering
    round_robin_inputs
    mcp_osx
    mcp_new_device
).freeze

# Replacement paths for linked files (only applied when DOWNLOAD_FILES is set).
FILES_MAPPINGS = {
    '/files/a3_mnemonic_cheatsheet.pdf' => '/files/ardour-2.8.3-bindings-x.pdf',
    '/files/a3_mnemonic_cheat_sheet_osx.pdf' => '/files/ardour-2.8.3-bindings-osx-a4.pdf'
}.freeze

# In-memory cache of slug => node id lookups. Deliberately NOT frozen: it is
# filled in while links are being resolved.
LINK_SLUG_TO_NODE_ID = {}

# Resolves the slug of a "/manual/<slug>" link to a numeric node id.
#
# The slug is first translated through SLUG_MAPPINGS. Results are memoized in
# LINK_SLUG_TO_NODE_ID and persisted under tmp/slug-to-node/<slug>, so the
# remote page is only fetched when neither cache has an entry.
#
# slug - String slug taken from a link href.
#
# Returns the Integer node id, or nil when the (mapped) slug is listed in
# MISSING_SLUGS.
def link_slug_to_node_id(slug)
    slug = SLUG_MAPPINGS[slug] || slug

    return nil if MISSING_SLUGS.include?(slug)

    LINK_SLUG_TO_NODE_ID[slug] ||= begin
        filename = "tmp/slug-to-node/#{slug}"

        # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
        if File.exist?(filename)
            File.read(filename).to_i
        else
            url = "http://ardour.org/manual/#{slug}"
            puts "opening #{url}"
            # URI.open is required here: since Ruby 3.0 Kernel#open no longer
            # opens URLs, even with open-uri loaded.
            page = Nokogiri(URI.open(url))
            node_id = page.at('#content .node')['id'].sub(/^node-/, '').to_i
            File.open(filename, 'w+') { |f| f << node_id }
            node_id
        end
    end
end


# Records the site path of a node in tmp/node-to-path/<node_id>.
#
# An existing record is never overwritten, so the first path registered for a
# node id wins (also across runs, since the files persist).
#
# node_id - node id used as the file name.
# path    - String path written into the file.
def register_node(node_id, path)
    filename = "tmp/node-to-path/#{node_id}"
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    File.open(filename, 'w+') { |f| f << path } unless File.exist?(filename)
end

# Looks up the path previously stored for a node in tmp/node-to-path/<node_id>.
#
# node_id - node id used as the file name.
#
# Returns the stored String path, or '' when no record exists for the node
# (an unknown node is tolerated rather than raised on).
def node_id_to_path!(node_id)
    filename = "tmp/node-to-path/#{node_id}"
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    return '' unless File.exist?(filename)
    File.read(filename)
end

# Fetches +url+ and stores the response body at +destination+.
# The body is downloaded before the file is opened so that a failed request
# does not leave an empty file behind; the file is written in binary mode.
def download_to(url, destination)
    # URI.open: since Ruby 3.0 Kernel#open no longer opens URLs.
    body = URI.open(url, &:read)
    FileUtils.mkdir_p File.dirname(destination)
    File.binwrite(destination, body)
end

# Rewrites the <a> elements inside +content+ in place.
# "/manual/<slug>" links are pointed at the registered path of their node (or
# "/missing"); "/files/..." links are remapped and downloaded when
# DOWNLOAD_FILES is set. +page_path+ is only used in the progress message.
def rewrite_links(content, page_path)
    content.search('a').each do |a|
        case a['href']
        when /^\/manual\/(.*)/
            target_id = link_slug_to_node_id(Regexp.last_match(1))
            a['href'] = target_id ? "/#{node_id_to_path!(target_id)}" : '/missing'
        when /^(\/files\/.*)/
            next unless DOWNLOAD_FILES

            file_path = Regexp.last_match(1)
            if FILES_MAPPINGS[file_path]
                file_path = FILES_MAPPINGS[file_path]
                a['href'] = file_path
            end

            puts "downloading [#{file_path}] (for #{page_path})"
            # file_path already starts with "/", so no extra separator is added.
            download_to "http://ardour.org#{file_path}", "#{FILES_DIR}#{file_path}"
        end
    end
end

# Rewrites the <img> elements inside +content+ in place.
# Root-relative images are downloaded when GET_ARDOUR_ORG_IMAGES is set.
# Absolute http(s) URLs have scheme and host stripped so they become
# root-relative, and are downloaded when HANDLE_OTHER_IMAGES is set.
def rewrite_images(content)
    content.search('img').each do |img|
        src = img['src']

        case src
        when /^\//
            next unless GET_ARDOUR_ORG_IMAGES

            url = "http://ardour.org#{src}"
            puts "getting #{url}"
            download_to url, "#{FILES_DIR}#{src}"
        when /^https?:\/\//
            new_src = '/' + src.sub(/^https?:\/\/[^\/]+\//, '')
            img['src'] = new_src

            next unless HANDLE_OTHER_IMAGES

            puts "new_src: #{new_src}"
            puts "getting #{src}"
            download_to src, "#{FILES_DIR}#{new_src}"
        end
    end
end

# Writes one page: a front-matter header (layout and title), the HTML of
# +content+, and a "{% children %}" tag when the page has sub-sections.
# NOTE(review): the title is emitted unquoted; a title containing ": " may not
# parse as intended if the consumer reads the header as YAML - confirm.
def write_page(output_file, title, content, has_children)
    FileUtils.mkdir_p File.dirname(output_file)
    File.open(output_file, 'w:UTF-8') do |f|
        f << "---\nlayout: default\ntitle: #{title}\n---\n\n#{content.inner_html}\n"
        f << "{% children %}\n" if has_children
    end
end

# Recursively converts the nested "div.section-N" elements of the exported
# manual into one output page per section.
#
# For every section at +level+ the node is registered (node id => path), its
# links and images are rewritten, the page is written when WRITE is set, and
# the next level is processed.
#
# html          - parsed document or section element to search in.
# level         - Integer nesting level, used to build the "section-N" selector.
# path          - Array of slugs of the ancestor sections.
# numbered_path - Array of "NN_slug" directory names of the ancestor sections.
def process(html, level = 1, path = [], numbered_path = [])
    child_selector = "div.section-#{level + 1}"

    html.search("div.section-#{level}").each_with_index do |child, i|
        title = child.at('h1.book-heading').inner_text
        node_id = child['id'].sub(/^node-/, '')
        slug = title.downcase.gsub(' ', '-').gsub(/[^a-z0-9\-]/, '')

        # The manual's top-level section contributes no path component.
        root = slug == 'the-ardour3-manual'

        if root
            this_path = []
            this_numbered_path = []
        else
            # The original passed node_id as a third, unused format argument.
            numbered_slug = "%02d_%s" % [i + 1, slug]

            this_path = path + [slug]
            this_numbered_path = numbered_path + [numbered_slug]
        end

        page_path = this_path.join('/')
        register_node node_id, page_path

        has_children = !child.search(child_selector).empty?

        output_dir = "#{OUTPUT_DIR}/#{this_numbered_path.join('/')}"
        output_file = root ? "#{OUTPUT_DIR}/blah.html" : "#{output_dir}.html"

        # Work on a copy so that removing the sub-sections does not affect the
        # recursion over +child+ below.
        content = child.dup
        content.search('h1.book-heading').remove
        content.search(child_selector).remove

        # Drop a leading <h2> that merely repeats the page title.
        heading = content.at('h2')
        heading.remove if heading && heading.inner_text == title

        rewrite_links content, page_path
        rewrite_images content

        if WRITE
            FileUtils.mkdir_p output_dir if has_children
            write_page output_file, title, content, has_children
        end

        process(child, level + 1, this_path, this_numbered_path)
    end
end


# Script entry point: make sure a local copy of the export exists, create the
# lookup-cache directories, then convert the export into pages.

# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
unless File.exist?(FILENAME)
    puts "downloading #{URL} to #{FILENAME}"
    # Download first, then write: a failed request must not leave an empty
    # FILENAME behind, because later runs would reuse it instead of retrying.
    # URI.open is needed since Ruby 3.0 (Kernel#open no longer opens URLs).
    export_html = URI.open(URL, &:read)
    File.write(FILENAME, export_html)
end

# Directories used by register_node / node_id_to_path! and link_slug_to_node_id.
FileUtils.mkdir_p('tmp/node-to-path')
FileUtils.mkdir_p('tmp/slug-to-node')

process Nokogiri(File.read(FILENAME))
initial commit 2013-01-29 19:26:36 -05:00			`require 'nokogiri'`
			`require 'fileutils'`
			`require 'open-uri'`

			`URL = 'http://ardour.org/book/export/html/5848'`
			`FILENAME = 'drupal-export.html'`

			`WRITE = true`
			`DOWNLOAD_FILES = false`
			`GET_ARDOUR_ORG_IMAGES = false`
			`HANDLE_OTHER_IMAGES = false`

			`OUTPUT_DIR = '_manual'`

			`FILES_DIR = 'source'`

			`SLUG_MAPPINGS = {`
			`'working_with_sessions' => 'sessions',`
			`'export_stem' => 'export',`
			`'track_groups' => 'track_bus_groups',`
			`'vst_support' => 'windows_vst',`
			`'kbd_default' => 'default_bindings',`
			`'midistep_entry' => 'midi_step_entry',`
			`'midi_stepentry' => 'midi_step_entry'`
			`}`

			`MISSING_SLUGS = %w(`
			`range_selection`
			`track_templates`
			`track_template`
			`color_dialog`
			`region_layering`
			`round_robin_inputs`
			`mcp_osx`
			`mcp_new_device`
			`)`

			`FILES_MAPPINGS = {`
			`'/files/a3_mnemonic_cheatsheet.pdf' => '/files/ardour-2.8.3-bindings-x.pdf',`
			`'/files/a3_mnemonic_cheat_sheet_osx.pdf' => '/files/ardour-2.8.3-bindings-osx-a4.pdf'`
			`}`

			`LINK_SLUG_TO_NODE_ID = {}`

			`def link_slug_to_node_id(slug)`

			`slug = SLUG_MAPPINGS[slug] \|\| slug`

			`return nil if MISSING_SLUGS.include? slug`

			`LINK_SLUG_TO_NODE_ID[slug] \|\|= begin`
			`filename = "tmp/slug-to-node/#{slug}"`

			`if File.exists? filename`
			`File.read(filename).to_i`
			`else`
			`url = "http://ardour.org/manual/#{slug}"`
			`puts "opening #{url}"`
			`node_id = Nokogiri(open(url)).at('#content .node')['id'].sub(/^node\-/,'').to_i`
			`File.open(filename,'w+') { \|f\| f << node_id }`
			`node_id`
			`end`
			`end`
			`end`


			`def register_node(node_id, path)`
			`filename = "tmp/node-to-path/#{node_id}"`
			`File.open(filename,'w+') { \|f\| f << path } unless File.exists? filename`
			`end`

			`def node_id_to_path!(node_id)`
			`filename = "tmp/node-to-path/#{node_id}"`
			`return '' unless File.exists? filename`
			`#raise "no path for node-id #{node_id}" unless File.exists? filename`
			`File.read(filename)`
			`end`

			`def process(html, level = 1, path = [], numbered_path = [])`
			`html.search("div.section-#{level}").each_with_index do \|child, i\|`

			`title = child.at('h1.book-heading').inner_text`

			`node_id = child['id'].sub(/^node\-/,'')`


			`slug = title.downcase.gsub(' ','-').gsub(/[^a-z0-9\-]/, '')`

			`root = slug == 'the-ardour3-manual'`

			`if root`

			`# top level`

			`this_path = []`
			`this_numbered_path = []`
			`else`
			`numbered_slug = "%02d_%s" % [i + 1, slug, node_id]`

			`this_path = path + [slug]`
			`this_numbered_path = numbered_path + [numbered_slug]`
			`end`

			`register_node node_id, this_path.join('/')`

			`indent = ' ' * level * 3`

			`has_children = child.search("div.section-#{level + 1}").length > 0 #&& possible_children.any? { \|child\| child.search('div').length > 0 }`

			`output_dir = "#{OUTPUT_DIR}/#{this_numbered_path.join('/')}"`

			`output_file = case`
			`when root`
			`"#{OUTPUT_DIR}/blah.html"`
			`#when has_children`
			`# "#{output_dir}/index.html"`
			`else`
			`"#{output_dir}.html"`
			`end`

			`content = child.dup`

			`content.search('h1.book-heading').remove`
			`content.search("div.section-#{level + 1}").remove`

			`if heading = content.at('h2') and heading.inner_text == title`
			`heading.remove`
			`end`

			`#puts "processing links in [#{this_path.join('/')}]"`

			`content.search('a').each do \|a\|`
			`href = a['href']`
			`case href`
			`when /^\/manual\/(.*)/`
			`slug = $1`
			`if node_id = link_slug_to_node_id(slug)`
			`link_path = node_id_to_path! node_id`
			`#puts " link slug [#{slug}] -> #{node_id} -> #{link_path}"`
			`a['href'] = "/#{link_path}"`
			`else`
			`a['href'] = "/missing"`
			`end`

			`when /^(\/files\/.*)/`

			`if DOWNLOAD_FILES`
			`file_path = $1`


			`if FILES_MAPPINGS[file_path]`
			`file_path = FILES_MAPPINGS[file_path]`
			`a['href'] = file_path`
			`end`

			`puts "downloading [#{file_path}] (for #{this_path.join('/')})"`

			`filename = "#{FILES_DIR}/#{file_path}"`
			`FileUtils.mkdir_p File.dirname(filename)`
			`File.open(filename,'w+') { \|f\| f << open("http://ardour.org/#{file_path}").read }`
			`end`
			`end`
			`end`

			`content.search('img').each do \|img\|`

			`src = img['src']`

			`case src`
			`when /^\//`
			`if GET_ARDOUR_ORG_IMAGES`
			`url = "http://ardour.org#{src}"`
			`puts "getting #{url}"`
			`img_path = "#{FILES_DIR}#{src}"`
			`FileUtils.mkdir_p File.dirname(img_path)`
			`File.open(img_path, 'w+') { \|f\| f << open(url).read }`
			`end`
			`when /^http/`
			`new_src = '/' + src.sub(/^http:\/\/[^\/]+\//,'')`
			`img['src'] = new_src`

			`if HANDLE_OTHER_IMAGES`
			`puts "new_src: #{new_src}"`
			`img_path = "#{FILES_DIR}#{new_src}"`
			`FileUtils.mkdir_p File.dirname(img_path)`
			`puts "getting #{src}"`
			`File.open(img_path, 'w+') { \|f\| f << open(src).read }`
			`end`
			`end`

			`end`

			`if WRITE`
			`FileUtils.mkdir_p output_dir if has_children`
			`File.open(output_file, 'w:UTF-8') do \|f\|`
			`f << <<-HTML`
			`---`
			`layout: default`
			`title: #{title}`
			`---`

			`#{content.inner_html}`
			`HTML`

			`if has_children`
			`f << <<-HTML`
			`{% children %}`
			`HTML`
			`end`


			`end`
			`end`

			`process(child, level + 1, this_path, this_numbered_path)`
			`end`
			`end`


			`unless File.exists?(FILENAME)`
			`puts "downloading #{URL} to #{FILENAME}"`
			`File.open(FILENAME,'w+') { \|f\| f << open(URL).read }`
			`end`

			`FileUtils.mkdir_p('tmp/node-to-path')`
			`FileUtils.mkdir_p('tmp/slug-to-node')`

			`process Nokogiri(File.read(FILENAME))`