mirror of
https://github.com/TangentFoxy/.lua-files.git
synced 2024-11-20 21:34:23 +00:00
398 lines
14 KiB
Lua
Executable File
398 lines
14 KiB
Lua
Executable File
#!/usr/bin/env luajit
|
|
|
|
local help = [[Usage:
|
|
|
|
make-epub.lua <config (JSON file)> [action] [flag]
|
|
|
|
If "." is used instead of a JSON file, every JSON file in the current directory
|
|
will be used to make multiple ebooks back-to-back.
|
|
|
|
[action]: If not specified, all steps will be taken in order (except cleanall).
|
|
download: All pages will be downloaded to their own HTML files.
|
|
convert: Each page is converted to Markdown.
|
|
concat: A file is created for each section out of its pages.
|
|
markdown: Metadata frontmatter and Markdown section files will be
|
|
concatenated into a single Markdown file.
|
|
epub: Markdown file will be converted to an ePub using pandoc.
|
|
cleanpage: All page files will be deleted, along with their extra
|
|
directories.
|
|
cleanall: Deletes everything except the config file and ePub.
|
|
|
|
[flag]: If "--continue" is passed, script will continue with the default order
|
|
of actions from the action specified.
|
|
|
|
Requirements:
|
|
- Binaries: pandoc, curl
|
|
|
|
For how to write a configuration and examples, see the .lua-files README:
|
|
https://github.com/TangentFoxy/.lua-files#make-epublua
|
|
]]
|
|
|
|
local success, utility = pcall(function()
|
|
return dofile((arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)")) .. "utility-functions.lua")
|
|
end)
|
|
if not success then
|
|
print("\n\n" .. tostring(utility))
|
|
error("\n\nThis script may be installed improperly. Follow instructions at:\n\thttps://github.com/TangentFoxy/.lua-files#installation\n")
|
|
end
|
|
|
|
local path_separator = utility.path_separator
|
|
local copyright_warning = "This ebook was created by an automated tool for personal use. It cannot be distributed or sold without permission of its copyright holder(s). (If you did not make this ebook, you may be infringing.)\n\n"
|
|
|
|
-- also checks for errors TODO make it check for ALL required elements and error if any are missing!
|
|
local function load_config(config_file_text)
|
|
local json = utility.require("json")
|
|
|
|
config = json.decode(config_file_text)
|
|
config.config_file_text = config_file_text
|
|
|
|
if not config.authors then
|
|
config.authors = {} -- at least have an empty table so it doesn't error below TODO verify that this is actually true
|
|
end
|
|
|
|
if not config.keywords then
|
|
config.keywords = {} -- TODO test if it will work empty
|
|
end
|
|
|
|
if config.author then -- old style single author will be prepended to authors list
|
|
table.insert(config.authors, 1, config.author)
|
|
end
|
|
|
|
-- if only using a single section
|
|
if config.first_section_url and not config.base_url then
|
|
config.base_url = config.first_section_url -- prevent errors due to required item being missing
|
|
end
|
|
|
|
if not config.sections then
|
|
config.sections = {} -- I've decided to allow empty sections (defaults to 1 section, for single story ebooks)
|
|
end
|
|
|
|
-- detecting manually specified sections and flagging it to the rest of the script
|
|
if config.sections[1] then
|
|
config.sections.start = 1
|
|
config.sections.finish = #config.sections
|
|
config.manually_specified_sections = true -- decided to make this part of the config spec, but it's set here again just in case
|
|
config.base_url = "http://example.com/" -- must be defined to prevent errors; it will be manipulated and ignored in this use case
|
|
end
|
|
|
|
if not config.sections.start then
|
|
config.sections.start = 1 -- the first one can be optional since the common use case is ALL OF THEM
|
|
end
|
|
|
|
if not config.sections.finish then
|
|
config.sections.finish = 1
|
|
end
|
|
|
|
if #config.page_counts ~= config.sections.finish - config.sections.start + 1 then
|
|
error("Number of page_counts does not match number of sections.")
|
|
end
|
|
|
|
if config.section_titles and #config.section_titles ~= config.sections.finish - config.sections.start + 1 then
|
|
error("Number of section_titles does not match number of sections.")
|
|
end
|
|
|
|
local base_file_name
|
|
if config.title and config.authors[1] then
|
|
-- first author in list gets top billing (this is problematic in anthologies unless an editor is the first entry)
|
|
base_file_name = config.title .. " by " .. config.authors[1]
|
|
elseif config.title then
|
|
base_file_name = config.title
|
|
else
|
|
base_file_name = "Book"
|
|
end
|
|
config.base_file_name = utility.make_safe_file_name(config.base_file_name or base_file_name)
|
|
|
|
return config
|
|
end
|
|
|
|
local function format_metadata(config)
|
|
local function stringify_list(list)
|
|
local output = utility.escape_quotes(list[1]):enquote()
|
|
for i = 2, #list do
|
|
output = output .. ", " .. utility.escape_quotes(list[i]):enquote()
|
|
end
|
|
return output
|
|
end
|
|
|
|
local keywords_string = stringify_list(config.keywords)
|
|
local metadata = {
|
|
"---",
|
|
"title: " .. utility.escape_quotes(config.title):enquote(),
|
|
"author: [" .. stringify_list(config.authors) .. "]",
|
|
"keywords: [" .. keywords_string .. "]",
|
|
"tags: [" .. keywords_string .. "]",
|
|
"---",
|
|
"",
|
|
}
|
|
|
|
return table.concat(metadata, "\n") .. "\n"
|
|
end
|
|
|
|
local function download_pages(config)
|
|
print("\nDownloading pages...\n")
|
|
local htmlparser = utility.require("htmlparser")
|
|
utility.required_program("curl")
|
|
local working_dir = config.base_file_name
|
|
|
|
os.execute("mkdir " .. working_dir:enquote())
|
|
for section = config.sections.start, config.sections.finish do
|
|
local section_dir = working_dir .. path_separator .. tostring(section) .. path_separator
|
|
os.execute("mkdir " .. section_dir:sub(1, -2):enquote())
|
|
|
|
local section_url
|
|
if section == 1 and config.first_section_url then
|
|
section_url = config.first_section_url
|
|
else
|
|
section_url = config.base_url .. string.format("%02i", section) -- leftpad 2 (This will eventually cause problems.)
|
|
end
|
|
|
|
if config.manually_specified_sections then
|
|
section_url = config.sections[section]
|
|
end
|
|
|
|
for page = 1, config.page_counts[section - (config.sections.start - 1)] do
|
|
local download_url
|
|
if page == 1 then
|
|
download_url = section_url
|
|
else
|
|
download_url = section_url .. "?page=" .. tostring(page)
|
|
end
|
|
|
|
local temporary_html_file_name = utility.tmp_file_name()
|
|
os.execute("curl " .. download_url:enquote() .. " > " .. temporary_html_file_name)
|
|
|
|
utility.open(temporary_html_file_name, "r", "Could not download " .. download_url:enquote())(function(html_file)
|
|
local raw_html = html_file:read("*all")
|
|
|
|
local parser = htmlparser.parse(raw_html)
|
|
local content_tag = parser:select(".article > div > div") -- TODO add ability to set selector in config!
|
|
local text = content_tag[1]:getcontent()
|
|
|
|
if page == 1 and config.extract_titles then
|
|
text = parser:select(".headline")[1]:gettext() .. text
|
|
end
|
|
|
|
utility.open(section_dir .. page .. ".html", "w")(function(page_file)
|
|
page_file:write(text .. "\n")
|
|
end)
|
|
end)
|
|
|
|
os.execute("rm " .. temporary_html_file_name)
|
|
os.execute("sleep " .. tostring(math.random(5))) -- avoid rate limiting
|
|
end
|
|
end
|
|
end
|
|
|
|
local function convert_pages(config)
|
|
print("\nConverting pages...\n")
|
|
utility.required_program("pandoc")
|
|
local working_dir = config.base_file_name
|
|
|
|
for section = config.sections.start, config.sections.finish do
|
|
local section_dir = working_dir .. path_separator .. tostring(section) .. path_separator
|
|
|
|
for page = 1, config.page_counts[section - (config.sections.start - 1)] do
|
|
local page_file_name_base = section_dir .. page
|
|
os.execute("pandoc --from html --to markdown " .. (page_file_name_base .. ".html"):enquote() .. " -o " .. (page_file_name_base .. ".md"):enquote())
|
|
end
|
|
end
|
|
end
|
|
|
|
local function concatenate_pages(config)
|
|
print("\nConcatenating pages...\n")
|
|
local working_dir = config.base_file_name
|
|
|
|
for section = config.sections.start, config.sections.finish do
|
|
local section_dir = working_dir .. path_separator .. tostring(section) .. path_separator
|
|
utility.open(working_dir .. path_separator .. tostring(section) .. ".md", "w")(function(section_file)
|
|
for page = 1, config.page_counts[section - (config.sections.start - 1)] do
|
|
utility.open(section_dir .. page .. ".md", "r")(function(page_file)
|
|
if config.sections.automatic_naming then
|
|
local naming_patterns = {
|
|
"^Prologue$",
|
|
"^Chapter %d+$",
|
|
"^%*%*CHAPTER ",
|
|
}
|
|
local line = page_file:read("*line")
|
|
while line do
|
|
for _, pattern in ipairs(naming_patterns) do
|
|
if line:find(pattern) then
|
|
line = "# " .. line
|
|
end
|
|
end
|
|
section_file:write(line .. "\n")
|
|
line = page_file:read("*line")
|
|
end
|
|
else
|
|
section_file:write(page_file:read("*all"))
|
|
end
|
|
section_file:write("\n") -- guarantees no accidental line collisions
|
|
end)
|
|
end
|
|
end)
|
|
end
|
|
end
|
|
|
|
local function write_markdown_file(config)
|
|
print("\nWriting Markdown file...\n")
|
|
local working_dir = config.base_file_name
|
|
|
|
utility.open(config.base_file_name .. ".md", "w")(function(markdown_file)
|
|
markdown_file:write(format_metadata(config))
|
|
markdown_file:write(copyright_warning)
|
|
|
|
for section = config.sections.start, config.sections.finish do
|
|
if config.sections.naming then
|
|
markdown_file:write("\n\n# " .. config.sections.naming .. " " .. tostring(section))
|
|
elseif config.section_titles then
|
|
markdown_file:write("\n\n# " .. config.section_titles[section])
|
|
elseif config.lazy_titling then
|
|
local section_url
|
|
if section == 1 and config.first_section_url then
|
|
section_url = config.first_section_url
|
|
else
|
|
section_url = config.base_url
|
|
end
|
|
if config.manually_specified_sections then
|
|
section_url = config.sections[section]
|
|
end
|
|
|
|
local title_parts = section_url:sub(30):gsplit("-")
|
|
while tonumber(title_parts[#title_parts]) do
|
|
title_parts[#title_parts] = nil
|
|
end
|
|
local last_part = title_parts[#title_parts]
|
|
if last_part == "ch" or last_part == "pt" then
|
|
title_parts[#title_parts] = nil
|
|
end
|
|
for index, part in ipairs(title_parts) do
|
|
title_parts[index] = part:sub(1, 1):upper() .. part:sub(2)
|
|
end
|
|
markdown_file:write("\n\n# " .. table.concat(title_parts, " "))
|
|
end
|
|
markdown_file:write("\n\n")
|
|
|
|
local section_file_name = working_dir .. path_separator .. tostring(section)
|
|
utility.open(section_file_name .. ".md", "r")(function(section_file)
|
|
markdown_file:write(section_file:read("*all"))
|
|
end)
|
|
end
|
|
|
|
markdown_file:write("\n\n# Ebook Creation Metadata\n\n")
|
|
markdown_file:write(copyright_warning)
|
|
markdown_file:write("This ebook was created using the following config:\n\n")
|
|
markdown_file:write("```json\n" .. config.config_file_text .. "\n```\n")
|
|
end)
|
|
end
|
|
|
|
local function make_epub(config)
|
|
print("\nMaking ePub...\n")
|
|
utility.required_program("pandoc")
|
|
local output_dir = "All ePubs"
|
|
os.execute("mkdir " .. output_dir:enquote())
|
|
|
|
local markdown_file_name = config.base_file_name .. ".md"
|
|
local epub_file_name = output_dir .. path_separator .. config.base_file_name .. ".epub"
|
|
os.execute("pandoc --from markdown --to epub " .. markdown_file_name:enquote() .. " -o " .. epub_file_name:enquote() .. " --toc=true")
|
|
end
|
|
|
|
local function rm_page_files(config)
|
|
print("\nRemoving page files...\n")
|
|
local working_dir = config.base_file_name
|
|
|
|
for section = config.sections.start, config.sections.finish do
|
|
local section_dir = working_dir .. path_separator .. tostring(section)
|
|
os.execute(utility.recursive_remove_command .. section_dir:enquote())
|
|
end
|
|
end
|
|
|
|
local function rm_all(config)
|
|
print("\nRemoving all extra files...\n")
|
|
local working_dir = config.base_file_name
|
|
|
|
os.execute(utility.recursive_remove_command .. working_dir:enquote())
|
|
os.execute("rm " .. (config.base_file_name .. ".md"):enquote())
|
|
end
|
|
|
|
local function argparse(arguments, positional_arguments)
|
|
local recognized_arguments = {}
|
|
for index, argument in ipairs(arguments) do
|
|
for _, help in ipairs({"-h", "--help", "/?", "/help", "help"}) do
|
|
if argument == help then
|
|
print(help)
|
|
return nil
|
|
end
|
|
end
|
|
if positional_arguments[index] then
|
|
recognized_arguments[positional_arguments[index]] = argument
|
|
end
|
|
end
|
|
return recognized_arguments
|
|
end
|
|
|
|
local function main(arguments)
|
|
local config = utility.open(arguments.json_file_name, "r")(function(config_file)
|
|
return load_config(config_file:read("*all"))
|
|
end)
|
|
|
|
local actions = {
|
|
download = download_pages,
|
|
convert = convert_pages,
|
|
concat = concatenate_pages,
|
|
markdown = write_markdown_file,
|
|
epub = make_epub,
|
|
cleanpage = rm_page_files,
|
|
cleanall = rm_all,
|
|
}
|
|
local default_action_order = {
|
|
"download",
|
|
"convert",
|
|
"concat",
|
|
"cleanpage",
|
|
"markdown",
|
|
"epub",
|
|
}
|
|
|
|
if arguments.action then
|
|
if actions[arguments.action] then
|
|
actions[arguments.action](config)
|
|
if arguments.flag == "--continue" then
|
|
local starting_point_reached = false
|
|
for _, action in ipairs(default_action_order) do
|
|
if starting_point_reached then
|
|
actions[action](config)
|
|
elseif action == arguments.action then
|
|
starting_point_reached = true
|
|
end
|
|
end
|
|
end
|
|
else
|
|
print(help)
|
|
error("\nInvalid action specified.")
|
|
end
|
|
else
|
|
for _, action in ipairs(default_action_order) do
|
|
actions[action](config)
|
|
end
|
|
end
|
|
print("\nDone!\n")
|
|
end
|
|
|
|
local positional_arguments = {"json_file_name", "action", "flag"}
|
|
local arguments = argparse(arg, positional_arguments)
|
|
if not arguments.json_file_name then
|
|
print(help)
|
|
error("\nA config file name/path must be specified.")
|
|
end
|
|
|
|
if arguments.json_file_name == "." then
|
|
utility.ls(".")(function(file_name)
|
|
if file_name:find(".json$") then
|
|
arguments.json_file_name = file_name
|
|
main(arguments)
|
|
end
|
|
end)
|
|
else
|
|
main(arguments)
|
|
end
|