#!/usr/bin/env luajit

-- Downloads a multi-page, multi-section story, converts it to Markdown with
-- pandoc, and builds an ePub out of the result. Run with -h for usage.

local help = [[Usage:

  make-epub.lua <config (JSON file)> [action]

[action]: If not specified, all steps will be taken in order (except cleanall).
  download:  All pages will be downloaded to their own HTML files.
  concat:    A file is created for each section out of its pages.
  convert:   Each section is converted to Markdown.
  markdown:  Metadata frontmatter and Markdown section files will be
             concatenated into a single Markdown file.
  epub:      Markdown file will be converted to an ePub using pandoc.
  cleanhtml: All HTML files will be deleted, along with their extra
             directories.
  cleanall:  Deletes everything except the ePub.

Requirements:
- Lua libraries: htmlparser, dkjson (or compatible)
- Binaries: pandoc, curl

Configuration example:
  {
    "author": "Name",
    "title": "Book",
    "keywords": ["fantasy", "dragon", "isekai"],
    "base_url": "https://www.literotica.com/s/title-ch-", -- not required if only one section
    "first_section_url": "https://www.literotica.com/s/title",
    "sections": {
      "start": 1,
      "finish": 3,
      "naming": "Chapter" -- not required, but will screw up the Table of Contents if absent
    },
    "page_counts": [1, 5, 3]
  }
]]

-- Load the shared utility library that lives next to this script.
-- The directory lookup is parenthesized because `..` binds tighter than `or`;
-- without the parentheses a successful first match would yield only the
-- directory and dofile() would be handed a directory name.
local success, utility = pcall(function()
  local script_dir = arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)") or ""
  return dofile(script_dir .. "utility-functions.lua")
end)
if not success then
  print("\n\n" .. tostring(utility))
  error("\n\nThis script may be installed improperly. Follow instructions at:\n\thttps://github.com/TangentFoxy/.lua-files#installation\n")
end

local json = utility.require("json")

local path_separator
if utility.OS == "Windows" then
  path_separator = "\\"
else
  path_separator = "/"
end

--- Reads and validates the JSON config named by the first command-line argument.
-- Prints the help text and exits cleanly for -h / --help. Raises an error if
-- no config is given, the file cannot be opened or parsed, required tables are
-- missing, or the number of page_counts does not match the number of sections.
-- When `sections` is a list of URLs, start/finish are derived from it and
-- `manually_specified_sections` is set.
-- TODO make it check for the remaining required elements and error if any are missing!
-- @treturn table the decoded (and normalized) configuration
local function get_config()
  if not arg[1] then
    print(help)
    error("\nA config file name/path must be specified.")
  elseif arg[1] == "-h" or arg[1] == "--help" then
    -- asking for help is not a failure: print it and exit gracefully
    print(help)
    os.exit(0)
  end

  local file, err = io.open(arg[1], "r")
  if not file then error(err) end
  -- local: the rest of the script receives the config as an argument
  local config = json.decode(file:read("*a"))
  file:close()

  if type(config) ~= "table" then
    error("Config file \"" .. arg[1] .. "\" does not contain a valid JSON object.")
  end
  if type(config.sections) ~= "table" then
    error("Config is missing required \"sections\".")
  end
  if type(config.page_counts) ~= "table" then
    error("Config is missing required \"page_counts\".")
  end

  -- detecting manually specified sections and flagging it to the rest of the script
  if config.sections[1] then
    config.sections.start = 1
    config.sections.finish = #config.sections
    -- decided to make this part of the config spec, but it's set here again just in case
    config.manually_specified_sections = true
    -- must be defined to prevent errors; it will be manipulated and ignored in this use case
    config.base_url = "http://example.com/"
  end

  if #config.page_counts ~= config.sections.finish - config.sections.start + 1 then
    error("Number of page_counts does not match number of sections.")
  end

  return config
end

--- Builds the YAML metadata frontmatter placed at the top of the Markdown file.
-- @tparam table config uses `title`, `author`, and `keywords`
-- @treturn string frontmatter block, terminated by blank lines
local function format_metadata(config)
  -- Renders a list of strings as `"a", "b", "c"`; an empty or missing list
  -- renders as an empty string.
  local function stringify_list(list)
    if not list or #list == 0 then return "" end
    local quoted = {}
    for i = 1, #list do
      quoted[i] = "\"" .. utility.escape_quotes(list[i]) .. "\""
    end
    return table.concat(quoted, ", ")
  end

  local keywords_string = stringify_list(config.keywords)
  local metadata = {
    "---",
    "title: \"" .. utility.escape_quotes(config.title) .. "\"",
    "author:",
    "- \"" .. utility.escape_quotes(config.author) .. "\"",
    "keywords: [" .. keywords_string .. "]",
    "tags: [" .. keywords_string .. "]",
    "---",
    "",
  }

  return table.concat(metadata, "\n") .. "\n"
end

--- Downloads every page of every section into Sections/<section>/<page>.html.
-- Each page is fetched with curl into a temporary file, the story content is
-- extracted with htmlparser, and only that content is written out. Sleeps a
-- random number of seconds between pages.
-- @tparam table config uses `sections`, `page_counts`, `base_url`,
--   `first_section_url`, and `manually_specified_sections`
local function download_pages(config)
  local htmlparser = utility.require("htmlparser")
  utility.required_program("curl")

  os.execute("mkdir Sections")
  for section = config.sections.start, config.sections.finish do
    local section_dir = "Sections" .. path_separator .. tostring(section) .. path_separator
    os.execute("mkdir " .. section_dir:sub(1, -2)) -- strip the trailing separator

    local section_url
    if section == 1 and config.first_section_url then
      section_url = config.first_section_url
    else
      section_url = config.base_url .. string.format("%02i", section) -- leftpad 2 (This will eventually cause problems.)
    end

    if config.manually_specified_sections then
      section_url = config.sections[section]
    end

    -- page_counts is indexed from 1 regardless of which section number we start at
    for page = 1, config.page_counts[section - (config.sections.start - 1)] do
      local download_url
      if page == 1 then
        download_url = section_url
      else
        download_url = section_url .. "?page=" .. tostring(page)
      end

      local html_file_name = ".tmp." .. tostring(math.random()) .. ".html"
      os.execute("curl \"" .. download_url .. "\" > " .. html_file_name)

      local html_file = io.open(html_file_name, "r")
      if not html_file then error("Could not download \"" .. download_url .. "\"") end
      local raw_html = html_file:read("*a")
      html_file:close()
      os.remove(html_file_name) -- portable replacement for shelling out to `rm`

      local parser = htmlparser.parse(raw_html)
      local content_tag = parser:select(".article > div > div") -- TODO add ability to set selector in config!
      if not content_tag[1] then
        error("Could not find story content in \"" .. download_url .. "\" (selector matched nothing)")
      end
      local text = content_tag[1]:getcontent()

      local page_file, err = io.open(section_dir .. page .. ".html", "w")
      if not page_file then error(err) end
      page_file:write(text .. "\n")
      page_file:close()

      os.execute("sleep " .. tostring(math.random(5))) -- avoid rate limiting
    end
  end
end

--- Concatenates each section's page files into Sections/<section>.html.
-- @tparam table config uses `sections` and `page_counts`
local function concatenate_pages(config)
  for section = config.sections.start, config.sections.finish do
    local section_dir = "Sections" .. path_separator .. tostring(section) .. path_separator
    local section_file, err = io.open("Sections" .. path_separator .. tostring(section) .. ".html", "w")
    if not section_file then error(err) end

    for page = 1, config.page_counts[section - (config.sections.start - 1)] do
      local page_file, page_err = io.open(section_dir .. page .. ".html", "r")
      if not page_file then error(page_err) end
      section_file:write(page_file:read("*a") .. "\n")
      page_file:close()
    end

    -- close (and therefore flush) the section before pandoc reads it in the next step
    section_file:close()
  end
end

--- Derives a file-system-safe base name (no extension) from title and author.
-- Falls back to the title alone, or to "Book" when there is no title.
-- @tparam table config uses `title` and `author`
-- @treturn string base file name
local function get_base_file_name(config)
  -- TODO move this function to utility
  local function make_safe_file_name(file_name)
    file_name = file_name:gsub("[%\"%:%\\%!%@%#%$%%%^%*%=%{%}%|%;%<%>%?%/]", "") -- everything except the &
    file_name = file_name:gsub(" %&", ",") -- replacing & with a comma works for 99% of things
    file_name = file_name:gsub("%&", ",")  -- replacing & with a comma works for 99% of things
    file_name = file_name:gsub("%s+", " ") -- more than one space in succession should be a single space
    return file_name
  end

  local base_file_name
  if config.title and config.author then
    base_file_name = config.title .. " by " .. config.author
  elseif config.title then
    base_file_name = config.title
  else
    base_file_name = "Book"
  end

  return make_safe_file_name(base_file_name)
end

--- Converts each Sections/<section>.html to Sections/<section>.md using pandoc.
-- @tparam table config uses `sections`
local function convert_sections(config)
  -- the HTML I'm pulling from is often bugged in a way that breaks ebook readers, but pandoc can understand and fix in Markdown
  utility.required_program("pandoc")

  for section = config.sections.start, config.sections.finish do
    local section_file_name = "Sections" .. path_separator .. tostring(section)
    os.execute("pandoc \"" .. section_file_name .. ".html\" -o \"" .. section_file_name .. ".md\"")
  end
end

--- Writes the combined Markdown file: metadata frontmatter followed by every
-- section, each preceded by a heading when `sections.naming` is set.
-- @tparam table config uses `sections`, plus whatever format_metadata and
--   get_base_file_name read
local function write_markdown_file(config)
  local markdown_file, err = io.open(get_base_file_name(config) .. ".md", "w")
  if not markdown_file then error(err) end
  markdown_file:write(format_metadata(config))

  for section = config.sections.start, config.sections.finish do
    if config.sections.naming then
      markdown_file:write("\n\n# " .. config.sections.naming .. " " .. tostring(section) .. "\n\n")
    else
      markdown_file:write("\n\n\n\n") -- TODO add ability to manually specify names for manually listed sections
    end

    local section_file_name = "Sections" .. path_separator .. tostring(section)
    local section_file, section_err = io.open(section_file_name .. ".md", "r")
    if not section_file then error(section_err) end
    markdown_file:write(section_file:read("*a"))
    section_file:close()
  end

  markdown_file:close()
end

--- Converts the combined Markdown file into an ePub with pandoc.
-- @tparam table config passed to get_base_file_name
local function make_epub(config)
  utility.required_program("pandoc")
  local base_file_name = get_base_file_name(config)
  os.execute("pandoc \"" .. base_file_name .. ".md\" -o \"" .. base_file_name .. ".epub\" --toc=true")
end

--- Deletes the per-section and per-page HTML files and the per-section directories.
-- @tparam table config uses `sections` and `page_counts`
local function rm_html_files(config)
  for section = config.sections.start, config.sections.finish do
    local section_dir = "Sections" .. path_separator .. tostring(section)
    os.remove(section_dir .. ".html")

    for page = 1, config.page_counts[section - (config.sections.start - 1)] do
      os.remove(section_dir .. path_separator .. page .. ".html")
    end

    -- directories still go through the shell; rmdir only removes them when empty
    os.execute("rmdir " .. section_dir)
  end
end

--- Deletes every intermediate file (HTML, per-section Markdown, the Sections
-- directory, and the combined Markdown file), leaving only the ePub.
-- @tparam table config uses `sections`, plus whatever rm_html_files and
--   get_base_file_name read
local function rm_all(config)
  rm_html_files(config)

  for section = config.sections.start, config.sections.finish do
    os.remove("Sections" .. path_separator .. tostring(section) .. ".md")
  end

  os.execute("rmdir Sections")
  os.remove(get_base_file_name(config) .. ".md")
end

-- Maps each command-line action name to the step that implements it.
local execute = {
  download = download_pages,
  concat = concatenate_pages,
  convert = convert_sections,
  markdown = write_markdown_file,
  epub = make_epub,
  cleanhtml = rm_html_files,
  cleanall = rm_all,
}

local config = get_config()

local action = arg[2]
if action then
  if execute[action] then
    execute[action](config)
  else
    -- unknown action
    print(help)
  end
else
  -- no action given: run the full pipeline (everything except cleanall)
  print("\nDownloading pages...\n")
  download_pages(config)
  print("\nConcatenating pages...\n")
  concatenate_pages(config)
  print("\nConverting sections...\n")
  convert_sections(config)
  print("\nWriting Markdown file...\n")
  write_markdown_file(config)
  print("\nMaking ePub...\n")
  make_epub(config)
  print("\nRemoving HTML files...\n")
  rm_html_files(config)
  print("\nDone!\n")
end