2024-11-05 07:16:24 +00:00
#!/usr/bin/env luajit
-- Usage text: printed when no config argument is given, when help is
-- requested, and when an unknown action is named.
local help = [[Usage:

  make-epub.lua <config (JSON file)> [action]

[action]: If not specified, all steps will be taken in order (except clean*).
            download:  All pages will be downloaded to their own HTML files.
            convert:   Each page is converted to Markdown.
            concat:    A file is created for each section out of its pages.
            markdown:  Metadata frontmatter and Markdown section files will be
                       concatenated into a single Markdown file.
            epub:      Markdown file will be converted to an ePub using pandoc.
            cleanhtml: All HTML files will be deleted, along with their extra
                       directories.
            cleanall:  Deletes everything except the ePub.

Requirements:
- Lua libraries: htmlparser, dkjson (or compatible)
- Binaries: pandoc, curl

For how to write a configuration and examples, see the .lua-files README:
https://github.com/TangentFoxy/.lua-files#make-epublua
]]
2024-11-05 07:28:21 +00:00
-- Load the shared helper library that sits next to this script.
-- arg[0] is the path the script was invoked with; everything up to its last
-- "/" or "\" is taken as the script's directory. When arg[0] has no directory
-- part (most likely the script was started from its own directory) fall back
-- to a relative lookup instead of concatenating nil.
local success, utility = pcall(function()
  local script_dir = arg[0]:match("@?(.*/)") or arg[0]:match("@?(.*\\)") or ""
  return dofile(script_dir .. "utility-functions.lua")
end)
if not success then
  -- on failure the second pcall result is the error value, not the library
  print("\n\n" .. tostring(utility))
  error("\n\nThis script may be installed improperly. Follow instructions at:\n\thttps://github.com/TangentFoxy/.lua-files#installation\n")
end
2024-11-05 07:53:24 +00:00
local json = utility.require("json")

-- TODO utility.path_separator should be a thing
-- The first character of package.config is the directory separator of the
-- running Lua build ("\" on Windows, "/" elsewhere).
local path_separator = package.config:sub(1, 1)

-- Notice written near the top of the generated Markdown and again in its
-- closing "Ebook Creation Metadata" section.
local copyright_warning = "This ebook was created by an automated tool for personal use. It cannot be distributed or sold without permission of copyright holder(s). (If you did not make this ebook, you may be infringing.)\n\n"

-- Undecoded text of the config file; filled in by get_config and embedded in
-- the generated Markdown.
local raw_config -- TODO file handling for configs should probably be in the argument parsing portion
2024-11-05 07:16:24 +00:00
-- Reads the config file named by arg[1], decodes it as JSON, fills in
-- defaults, and returns the resulting table.
-- also checks for errors
-- Side effect: stores the undecoded file text in the file-level raw_config.
-- Raises an error when no config argument is given, when the file cannot be
-- opened or decoded, when a required element is missing, or when the counts
-- do not line up. Prints the help text and exits when asked for help.
-- TODO make it check for every required element and error if any are missing!
local function get_config()
  -- TODO arg checking REALLY should not be here
  if not arg[1] then
    print(help)
    error("\nA config file name/path must be specified.")
  elseif arg[1] == "-h" or arg[1] == "--help" then
    -- asking for help is not a failure: print it and exit cleanly
    print(help)
    os.exit(0)
  end

  local file, err = io.open(arg[1], "r")
  if not file then error(err) end
  raw_config = file:read("*a") -- TODO file handling for configs should probably be in the argument parsing portion
  file:close() -- closed before decoding so a decode error cannot leave it open

  -- `local` matters: a bare assignment here would create a global.
  -- NOTE(review): presumably a dkjson-style decoder returns nil plus details
  -- on failure while other libraries raise instead; both end in an error here.
  local config, decode_position, decode_error = json.decode(raw_config)
  if type(config) ~= "table" then
    error("Could not decode \"" .. arg[1] .. "\" as JSON: " .. tostring(decode_error or decode_position))
  end
  if type(config.sections) ~= "table" then
    error("Config is missing required \"sections\".")
  end
  if type(config.page_counts) ~= "table" then
    error("Config is missing required \"page_counts\".")
  end

  if not config.authors then
    config.authors = {} -- at least have an empty table so it doesn't error below TODO verify that this is actually true
  end
  if not config.keywords then
    config.keywords = {} -- TODO test if it will work empty
  end
  if config.author then -- old style single author will be prepended to authors list
    table.insert(config.authors, 1, config.author)
  end

  -- detecting manually specified sections and flagging it to the rest of the script
  if config.sections[1] then
    config.sections.start = 1
    config.sections.finish = #config.sections
    config.manually_specified_sections = true -- decided to make this part of the config spec, but it's set here again just in case
    config.base_url = "http://example.com/" -- must be defined to prevent errors; it will be manipulated and ignored in this use case
  end

  if not config.sections.start then
    config.sections.start = 1 -- the first one can be optional since the common use case is ALL OF THEM
  end
  if not config.sections.finish then
    error("Config is missing required \"sections.finish\".")
  end

  local section_count = config.sections.finish - config.sections.start + 1
  if #config.page_counts ~= section_count then
    error("Number of page_counts does not match number of sections.")
  end
  if config.section_titles and #config.section_titles ~= section_count then
    error("Number of section_titles does not match number of sections.")
  end

  return config
end
-- Builds the YAML frontmatter block (title, author, keywords, tags) that
-- starts the generated Markdown file.
-- config: decoded config table; reads title, authors and keywords.
-- Returns the frontmatter as one string ending in a blank line.
local function format_metadata(config)
  -- Renders a list as comma-separated, double-quoted, escaped items.
  -- An empty list renders as an empty string, so "[]" is emitted around it.
  local function stringify_list(list)
    local quoted = {}
    for i = 1, #list do
      quoted[i] = "\"" .. utility.escape_quotes(list[i]) .. "\""
    end
    return table.concat(quoted, ", ")
  end

  local keywords_string = stringify_list(config.keywords or {})

  local metadata = { "---" }
  -- the title is optional elsewhere in this script (the file name falls back
  -- to "Book"), so leave the line out rather than fail on a missing title
  if config.title then
    metadata[#metadata + 1] = "title: \"" .. utility.escape_quotes(config.title) .. "\""
  end
  metadata[#metadata + 1] = "author: [" .. stringify_list(config.authors or {}) .. "]"
  metadata[#metadata + 1] = "keywords: [" .. keywords_string .. "]"
  metadata[#metadata + 1] = "tags: [" .. keywords_string .. "]" -- same list as keywords
  metadata[#metadata + 1] = "---"
  metadata[#metadata + 1] = ""

  return table.concat(metadata, "\n") .. "\n"
end
-- Downloads every page of every section with curl and saves the selected
-- content of each page to Sections/<section>/<page>.html.
-- config: decoded config table; reads sections, page_counts, base_url,
-- first_section_url and manually_specified_sections.
-- Raises an error when a page cannot be read back or has no matching content.
local function download_pages(config)
  local htmlparser = utility.require("htmlparser")
  utility.required_program("curl")

  os.execute("mkdir Sections") -- result ignored, so an existing directory is not fatal

  for section = config.sections.start, config.sections.finish do
    local section_dir = "Sections" .. path_separator .. tostring(section) .. path_separator
    os.execute("mkdir " .. section_dir:sub(1, -2)) -- sub() drops the trailing separator

    -- Explicitly listed URLs win; otherwise the URL is derived from base_url.
    local section_url
    if config.manually_specified_sections then
      section_url = config.sections[section]
    elseif section == 1 and config.first_section_url then
      section_url = config.first_section_url
    else
      section_url = config.base_url .. string.format("%02i", section) -- leftpad 2 (This will eventually cause problems.)
    end

    -- page_counts is indexed from 1 even when sections.start is not 1
    for page = 1, config.page_counts[section - (config.sections.start - 1)] do
      local download_url = section_url
      if page > 1 then
        download_url = section_url .. "?page=" .. tostring(page)
      end

      -- NOTE(review): download_url is placed between double quotes without
      -- escaping; a URL containing a quote or "$" would alter the command.
      local html_file_name = ".tmp." .. tostring(math.random()) .. ".html"
      os.execute("curl \"" .. download_url .. "\" > " .. html_file_name)

      local html_file = io.open(html_file_name, "r")
      if not html_file then error("Could not download \"" .. download_url .. "\"") end
      local raw_html = html_file:read("*a")
      html_file:close()
      os.remove(html_file_name) -- no need to spawn a shell to delete one file

      local parser = htmlparser.parse(raw_html)
      local content_tag = parser:select(".article > div > div") -- TODO add ability to set selector in config!
      if not content_tag[1] then
        -- the redirect creates the temp file even when curl fails, so an
        -- empty or unexpected page is only detectable here
        error("No content found in \"" .. download_url .. "\"")
      end
      local text = content_tag[1]:getcontent()

      local page_file, err = io.open(section_dir .. page .. ".html", "w")
      if not page_file then error(err) end
      page_file:write(text .. "\n")
      page_file:close()

      os.execute("sleep " .. tostring(math.random(5))) -- avoid rate limiting
    end
  end
end
2024-11-06 23:12:15 +00:00
-- Converts every downloaded page from HTML to Markdown with pandoc, writing
-- Sections/<section>/<page>.md next to each .html file.
-- config: decoded config table; reads sections and page_counts.
-- Raises an error when pandoc reports a failure.
local function convert_pages(config)
  utility.required_program("pandoc")

  for section = config.sections.start, config.sections.finish do
    local section_dir = "Sections" .. path_separator .. tostring(section) .. path_separator

    -- page_counts is indexed from 1 even when sections.start is not 1
    for page = 1, config.page_counts[section - (config.sections.start - 1)] do
      local page_file_name_base = section_dir .. page
      local status = os.execute("pandoc --from html --to markdown \"" .. page_file_name_base .. ".html\" -o \"" .. page_file_name_base .. ".md\"")
      -- Lua 5.1/LuaJIT return the exit status (0 on success); 5.2+ return true
      if status ~= 0 and status ~= true then
        error("pandoc could not convert \"" .. page_file_name_base .. ".html\"")
      end
    end
  end
end
2024-11-05 07:16:24 +00:00
-- Joins the Markdown pages of each section into Sections/<section>.md.
-- When config.sections.automatic_naming is set, lines that consist solely of
-- "Prologue" or "Chapter <number>" are turned into level-1 headings.
-- config: decoded config table; reads sections and page_counts.
-- Raises an error when a section or page file cannot be opened.
local function concatenate_pages(config)
  -- whole-line patterns recognised as headings; built once, not per page
  local naming_patterns = {
    "^Prologue$",
    "^Chapter %d+$",
  }

  for section = config.sections.start, config.sections.finish do
    local section_dir = "Sections" .. path_separator .. tostring(section) .. path_separator
    local section_file, err = io.open("Sections" .. path_separator .. tostring(section) .. ".md", "w")
    if not section_file then error(err) end

    -- page_counts is indexed from 1 even when sections.start is not 1
    for page = 1, config.page_counts[section - (config.sections.start - 1)] do
      local page_file, err = io.open(section_dir .. page .. ".md", "r")
      if not page_file then error(err) end

      if config.sections.automatic_naming then
        for line in page_file:lines() do
          for _, pattern in ipairs(naming_patterns) do
            if line:find(pattern) then
              line = "# " .. line
              break -- a prefixed line cannot match another anchored pattern
            end
          end
          section_file:write(line .. "\n")
        end
      else
        section_file:write(page_file:read("*a"))
      end

      section_file:write("\n") -- guarantees no accidental line collisions
      page_file:close()
    end

    -- flush and release the handle before later steps read the file back
    section_file:close()
  end
end
2024-11-06 23:12:15 +00:00
-- TODO define this earlier, use it to choose where files go (this will also require every command executed to have quotes wrapping it!)
-- Returns the extension-less file name used for the Markdown and ePub output.
-- config.base_file_name wins when set; otherwise the name is
-- "<title> by <first author>", then "<title>", then "Book".
-- The result is always passed through make_safe_file_name.
local function get_base_file_name(config)
  -- TODO move make_safe_file_name to utility
  -- Strips characters that are awkward in file names or shell commands.
  local function make_safe_file_name(file_name)
    file_name = file_name:gsub("[%\"%:%\\%!%@%#%$%%%^%*%=%{%}%|%;%<%>%?%/]", "") -- everything except the &
    -- replacing & with a comma works for 99% of things; whitespace before
    -- the & is dropped so the comma attaches to the preceding word
    file_name = file_name:gsub("%s*%&", ",")
    -- more than one space in succession should be a single space
    -- (a bracketed [%s+] would match one character and also eat literal "+")
    file_name = file_name:gsub("%s+", " ")
    return file_name
  end

  local first_author = config.authors and config.authors[1]

  local base_file_name
  if config.title and first_author then
    -- first author in list gets top billing (this is problematic in anthologies unless an editor is the first entry)
    base_file_name = config.title .. " by " .. first_author
  elseif config.title then
    base_file_name = config.title
  else
    base_file_name = "Book"
  end

  return make_safe_file_name(config.base_file_name or base_file_name)
end
2024-11-06 23:12:15 +00:00
-- NOTE deprecated (order of operations had to be changed, see #25)
-- Converts each Sections/<section>.html to Sections/<section>.md with pandoc.
-- config: decoded config table; reads sections.
-- Raises an error when pandoc reports a failure.
local function convert_sections(config)
  utility.required_program("pandoc")

  for section = config.sections.start, config.sections.finish do
    local section_file_name = "Sections" .. path_separator .. tostring(section)
    local status = os.execute("pandoc --from html --to markdown \"" .. section_file_name .. ".html\" -o \"" .. section_file_name .. ".md\"")
    -- Lua 5.1/LuaJIT return the exit status (0 on success); 5.2+ return true
    if status ~= 0 and status ~= true then
      error("pandoc could not convert \"" .. section_file_name .. ".html\"")
    end
  end
end
-- Writes the single Markdown file the ePub is built from: frontmatter, the
-- copyright notice, every section (with an optional heading), and a closing
-- section that embeds the config used.
-- config: decoded config table; reads sections, section_titles and whatever
-- format_metadata and get_base_file_name read.
-- Raises an error when the output or a section file cannot be opened.
local function write_markdown_file(config)
  local markdown_file, err = io.open(get_base_file_name(config) .. ".md", "w")
  if not markdown_file then error(err) end

  markdown_file:write(format_metadata(config))
  markdown_file:write(copyright_warning)

  for section = config.sections.start, config.sections.finish do
    if config.sections.naming then
      markdown_file:write("\n\n# " .. config.sections.naming .. " " .. tostring(section))
    elseif config.section_titles then
      -- section_titles holds one entry per section counted from 1, like
      -- page_counts, so the section number is shifted by sections.start
      markdown_file:write("\n\n# " .. config.section_titles[section - (config.sections.start - 1)])
    end
    markdown_file:write("\n\n")

    local section_file_name = "Sections" .. path_separator .. tostring(section)
    local section_file, err = io.open(section_file_name .. ".md", "r")
    if not section_file then error(err) end
    markdown_file:write(section_file:read("*a"))
    section_file:close()
  end

  markdown_file:write("# Ebook Creation Metadata\n\n")
  markdown_file:write(copyright_warning)
  markdown_file:write("This ebook was created using the following config:\n\n")
  markdown_file:write("```json\n" .. raw_config .. "\n```\n")
  markdown_file:close()
end
-- Converts the combined Markdown file to an ePub with pandoc, including a
-- table of contents.
-- config: decoded config table; only used to derive the file name.
-- Raises an error when pandoc reports a failure.
local function make_epub(config)
  utility.required_program("pandoc")

  local base_file_name = get_base_file_name(config)
  local status = os.execute("pandoc --from markdown --to epub \"" .. base_file_name .. ".md\" -o \"" .. base_file_name .. ".epub\" --toc=true")
  -- Lua 5.1/LuaJIT return the exit status (0 on success); 5.2+ return true
  if status ~= 0 and status ~= true then
    error("pandoc could not create \"" .. base_file_name .. ".epub\"")
  end
end
2024-11-05 08:11:37 +00:00
-- Deletes the HTML files of every section and then tries to remove each
-- section directory. Every step is best-effort: missing files are ignored.
-- config: decoded config table; reads sections and page_counts.
local function rm_html_files(config)
  os.execute("sleep 1") -- attempt to fix #14

  for section = config.sections.start, config.sections.finish do
    local section_dir = "Sections" .. path_separator .. tostring(section)

    -- os.remove needs no shell and simply returns nil when a file is absent
    os.remove(section_dir .. ".html")
    -- page_counts is indexed from 1 even when sections.start is not 1
    for page = 1, config.page_counts[section - (config.sections.start - 1)] do
      os.remove(section_dir .. path_separator .. page .. ".html")
    end

    os.execute("rmdir " .. section_dir) -- NOTE this is no longer possible due to Markdown versions of each page existing
  end
end
-- Deletes every intermediate file and directory: page Markdown, page HTML,
-- section Markdown, the Sections directory and the combined Markdown file.
-- Every step is best-effort: missing files are ignored.
-- config: decoded config table; reads sections and page_counts.
local function rm_all(config)
  -- There is a Markdown file for every HTML page; those must be removed
  -- first, otherwise the rmdir done by rm_html_files finds the section
  -- directories non-empty.
  for section = config.sections.start, config.sections.finish do
    local section_dir = "Sections" .. path_separator .. tostring(section) .. path_separator
    for page = 1, config.page_counts[section - (config.sections.start - 1)] do
      os.remove(section_dir .. page .. ".md")
    end
  end

  rm_html_files(config)

  for section = config.sections.start, config.sections.finish do
    os.remove("Sections" .. path_separator .. tostring(section) .. ".md")
  end

  os.execute("rmdir Sections")
  os.remove(get_base_file_name(config) .. ".md")
end
2024-11-05 07:16:24 +00:00
-- Dispatch table: maps each action name accepted on the command line to the
-- function that performs it. Every function receives the decoded config.
local execute = {
  -- pipeline steps
  download  = download_pages,
  convert   = convert_pages,
  concat    = concatenate_pages,
  markdown  = write_markdown_file,
  epub      = make_epub,
  -- cleanup actions, never run as part of the default pipeline
  cleanall  = rm_all,
  cleanhtml = rm_html_files,
}
-- Entry point: load the config, then run either the single action named by
-- the second argument or the whole pipeline in order.
local config = get_config()
local action = arg[2]

if action then
  local handler = execute[action]
  if not handler then
    -- report the problem and signal failure to the calling shell
    io.stderr:write("Unknown action: " .. tostring(action) .. "\n")
    print(help)
    os.exit(1)
  end
  handler(config)
else
  -- each entry: the progress message, then the step to run
  local pipeline = {
    { "\nDownloading pages...\n", download_pages },
    { "\nConverting pages...\n", convert_pages },
    { "\nConcatenating pages...\n", concatenate_pages },
    { "\nWriting Markdown file...\n", write_markdown_file },
    { "\nMaking ePub...\n", make_epub },
  }
  for _, step in ipairs(pipeline) do
    print(step[1])
    step[2](config)
  end
  -- print("\nRemoving HTML files...\n")
  -- rm_html_files(config)
  print("\nDone!\n")
end