generate_book.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. from subprocess import check_call
  2. import os
  3. import os.path as op
  4. import sys
  5. import shutil as sh
  6. import yaml
  7. from nbclean import NotebookCleaner
  8. from tqdm import tqdm
  9. import numpy as np
  10. from glob import glob
  11. from uuid import uuid4
  12. import argparse
  13. DESCRIPTION = ("Convert a collection of Jupyter Notebooks into Jekyll "
  14. "markdown suitable for a course textbook.")
  15. # Add path to our utility functions
  16. this_folder = op.dirname(op.abspath(__file__))
  17. sys.path.append(op.join(this_folder, 'scripts'))
  18. from jupyterbook.utils import (_split_yaml, _check_url_page, _prepare_toc,
  19. _prepare_url, _clean_notebook_cells, _error)
  20. parser = argparse.ArgumentParser(description=DESCRIPTION)
  21. parser.add_argument("--site-root", default=None, help="Path to the root of the textbook repository.")
  22. parser.add_argument("--path-template", default=None, help="Path to the template nbconvert uses to build markdown files")
  23. parser.add_argument("--path-config", default=None, help="Path to the Jekyll configuration file")
  24. parser.add_argument("--path-toc", default=None, help="Path to the Table of Contents YAML file")
  25. parser.add_argument("--overwrite", action='store_true', help="Overwrite md files if they already exist.")
  26. parser.add_argument("--execute", action='store_true', help="Execute notebooks before converting to MD.")
  27. parser.add_argument("--local-build", action='store_true',
  28. help="Specify you are building site locally for later upload.")
  29. parser.set_defaults(overwrite=False, execute=False)
  30. # Defaults
  31. BUILD_FOLDER_NAME = "_build"
  32. SUPPORTED_FILE_SUFFIXES = ['.ipynb', '.md']
  33. def _clean_lines(lines, filepath):
  34. """Replace images with jekyll image root and add escape chars as needed."""
  35. inline_replace_chars = ['#']
  36. # Images: replace absolute nbconvert image paths to baseurl paths
  37. path_rel_root = op.relpath(PATH_SITE_ROOT, op.dirname(filepath))
  38. path_rel_root_one_up = path_rel_root.replace('../', '', 1)
  39. for ii, line in enumerate(lines):
  40. # Handle relative paths because we remove `content/` from the URL
  41. # If there's a path that goes back to the root, remove a level`
  42. # This is for images referenced directly in the markdown
  43. if path_rel_root in line:
  44. line = line.replace(path_rel_root, path_rel_root_one_up)
  45. # For programmatically-generated images from notebooks, replace the abspath with relpath
  46. line = line.replace(PATH_IMAGES_FOLDER, op.relpath(PATH_IMAGES_FOLDER, op.dirname(filepath)))
  47. # Adding escape slashes since Jekyll removes them when it serves the page
  48. # Make sure we have at least two dollar signs and they
  49. # Aren't right next to each other
  50. dollars = np.where(['$' == char for char in line])[0]
  51. if len(dollars) > 2 and all(ii > 1 for ii in (dollars[1:] - dollars[:1])):
  52. for char in inline_replace_chars:
  53. line = line.replace('\\{}'.format(char), '\\\\{}'.format(char))
  54. line = line.replace(' \\$', ' \\\\$')
  55. lines[ii] = line
  56. return lines
  57. def _copy_non_content_files():
  58. """Copy non-markdown/notebook files in the content folder into build folder so relative links work."""
  59. all_files = glob(op.join(PATH_CONTENT_FOLDER, '**', '*'), recursive=True)
  60. non_content_files = [ii for ii in all_files if not any(ii.endswith(ext) for ext in SUPPORTED_FILE_SUFFIXES)]
  61. for ifile in non_content_files:
  62. if op.isdir(ifile):
  63. continue
  64. # The folder name may change if the permalink sanitizing changes it.
  65. # this ensures that a new folder exists if needed
  66. new_path = ifile.replace(os.sep + CONTENT_FOLDER_NAME, os.sep + BUILD_FOLDER_NAME)
  67. if not op.isdir(op.dirname(new_path)):
  68. os.makedirs(op.dirname(new_path))
  69. sh.copy2(ifile, new_path)
  70. def _case_sensitive_fs(path):
  71. """True when filesystem at `path` is case sensitive, False otherwise.
  72. Checks this by attempting to write two files, one w/ upper case, one
  73. with lower. If after this only one file exists, the system is case-insensitive.
  74. Makes directory `path` if it does not exist.
  75. """
  76. if not op.exists(path):
  77. os.makedirs(path)
  78. root = op.join(path, uuid4().hex)
  79. fnames = [root + suffix for suffix in 'aA']
  80. try:
  81. for fname in fnames:
  82. with open(fname, 'wt') as fobj:
  83. fobj.write('text')
  84. written = glob(root + '*')
  85. finally:
  86. for fname in written:
  87. os.unlink(fname)
  88. return len(written) == 2
  89. if __name__ == '__main__':
  90. ###############################################################################
  91. # Default values and arguments
  92. args = parser.parse_args()
  93. overwrite = bool(args.overwrite)
  94. execute = bool(args.execute)
  95. if args.site_root is None:
  96. args.site_root = op.join(op.dirname(op.abspath(__file__)), '..')
  97. # Paths for our notebooks
  98. PATH_SITE_ROOT = op.abspath(args.site_root)
  99. PATH_TOC_YAML = args.path_toc if args.path_toc is not None else op.join(PATH_SITE_ROOT, '_data', 'toc.yml')
  100. CONFIG_FILE = args.path_config if args.path_config is not None else op.join(PATH_SITE_ROOT, '_config.yml')
  101. PATH_TEMPLATE = args.path_template if args.path_template is not None else op.join(PATH_SITE_ROOT, 'scripts', 'templates', 'jekyllmd.tpl')
  102. PATH_IMAGES_FOLDER = op.join(PATH_SITE_ROOT, '_build', 'images')
  103. BUILD_FOLDER = op.join(PATH_SITE_ROOT, BUILD_FOLDER_NAME)
  104. ###############################################################################
  105. # Read in textbook configuration
  106. # Load the yaml for this site
  107. with open(CONFIG_FILE, 'r') as ff:
  108. site_yaml = yaml.load(ff.read())
  109. CONTENT_FOLDER_NAME = site_yaml.get('content_folder_name').strip('/')
  110. PATH_CONTENT_FOLDER = op.join(PATH_SITE_ROOT, CONTENT_FOLDER_NAME)
  111. # Load the textbook yaml for this site
  112. if not op.exists(PATH_TOC_YAML):
  113. _error("No toc.yml file found, please create one")
  114. with open(PATH_TOC_YAML, 'r') as ff:
  115. toc = yaml.load(ff.read())
  116. # Drop divider items and non-linked pages in the sidebar, un-nest sections
  117. toc = _prepare_toc(toc)
  118. ###############################################################################
  119. # Generating the Jekyll files for all content
  120. n_skipped_files = 0
  121. n_built_files = 0
  122. case_check = _case_sensitive_fs(BUILD_FOLDER) and args.local_build
  123. print("Convert and copy notebook/md files...")
  124. for ix_file, page in enumerate(tqdm(list(toc))):
  125. url_page = page.get('url', None)
  126. title = page.get('title', None)
  127. # Make sure URLs (file paths) have correct structure
  128. _check_url_page(url_page, CONTENT_FOLDER_NAME)
  129. ###############################################################################
  130. # Create path to old/new file and create directory
  131. # URL will be relative to the CONTENT_FOLDER
  132. path_url_page = os.path.join(PATH_CONTENT_FOLDER, url_page.lstrip('/'))
  133. path_url_folder = os.path.dirname(path_url_page)
  134. # URLs shouldn't have the suffix in there already so now we find which one to add
  135. for suf in SUPPORTED_FILE_SUFFIXES:
  136. if op.exists(path_url_page + suf):
  137. path_url_page = path_url_page + suf
  138. break
  139. if not op.exists(path_url_page):
  140. raise _error("Could not find file called {} with any of these extensions: {}".format(path_url_page, SUPPORTED_FILE_SUFFIXES))
  141. # Create and check new folder / file paths
  142. path_new_folder = path_url_folder.replace(os.sep + CONTENT_FOLDER_NAME, os.sep + BUILD_FOLDER_NAME)
  143. path_new_file = op.join(path_new_folder, op.basename(path_url_page).replace('.ipynb', '.md'))
  144. if overwrite is False and op.exists(path_new_file) \
  145. and os.stat(path_new_file).st_mtime > os.stat(path_url_page).st_mtime:
  146. n_skipped_files += 1
  147. continue
  148. if not op.isdir(path_new_folder):
  149. os.makedirs(path_new_folder)
  150. ###############################################################################
  151. # Generate previous/next page URLs
  152. if ix_file == 0:
  153. url_prev_page = ''
  154. prev_file_title = ''
  155. else:
  156. prev_file_title = toc[ix_file-1].get('title')
  157. url_prev_page = toc[ix_file-1].get('url')
  158. url_prev_page = _prepare_url(url_prev_page)
  159. if ix_file == len(toc) - 1:
  160. url_next_page = ''
  161. next_file_title = ''
  162. else:
  163. next_file_title = toc[ix_file+1].get('title')
  164. url_next_page = toc[ix_file+1].get('url')
  165. url_next_page = _prepare_url(url_next_page)
  166. ###############################################################################
  167. # Content conversion
  168. # Convert notebooks or just copy md if no notebook.
  169. if path_url_page.endswith('.ipynb'):
  170. # Create a temporary version of the notebook we can modify
  171. tmp_notebook = path_url_page + '_TMP'
  172. sh.copy2(path_url_page, tmp_notebook)
  173. ###############################################################################
  174. # Notebook cleaning
  175. # Clean up the file before converting
  176. cleaner = NotebookCleaner(tmp_notebook)
  177. cleaner.remove_cells(empty=True)
  178. if site_yaml.get('hide_cell_text', False):
  179. cleaner.remove_cells(search_text=site_yaml.get('hide_cell_text'))
  180. if site_yaml.get('hide_code_text', False):
  181. cleaner.clear(kind="content", search_text=site_yaml.get('hide_code_text'))
  182. cleaner.clear('stderr')
  183. cleaner.save(tmp_notebook)
  184. _clean_notebook_cells(tmp_notebook)
  185. ###############################################################################
  186. # Conversion to Jekyll Markdown
  187. # Run nbconvert moving it to the output folder
  188. # This is the output directory for `.md` files
  189. build_call = '--FilesWriter.build_directory={}'.format(path_new_folder)
  190. # Copy notebook output images to the build directory using the base folder name
  191. path_after_build_folder = path_new_folder.split(os.sep + BUILD_FOLDER_NAME + os.sep)[-1]
  192. nb_output_folder = op.join(PATH_IMAGES_FOLDER, path_after_build_folder)
  193. images_call = '--NbConvertApp.output_files_dir={}'.format(nb_output_folder)
  194. call = ['jupyter', 'nbconvert', '--log-level="CRITICAL"',
  195. '--to', 'markdown', '--template', PATH_TEMPLATE,
  196. images_call, build_call, tmp_notebook]
  197. if execute is True:
  198. call.insert(-1, '--execute')
  199. check_call(call)
  200. os.remove(tmp_notebook)
  201. elif path_url_page.endswith('.md'):
  202. # If a non-notebook file, just copy it over.
  203. # If markdown we'll add frontmatter later
  204. sh.copy2(path_url_page, path_new_file)
  205. else:
  206. raise _error("Files must end in ipynb or md. Found file {}".format(path_url_page))
  207. ###############################################################################
  208. # Modify the generated Markdown to work with Jekyll
  209. # Clean markdown for Jekyll quirks (e.g. extra escape characters)
  210. with open(path_new_file, 'r') as ff:
  211. lines = ff.readlines()
  212. lines = _clean_lines(lines, path_new_file)
  213. # Split off original yaml
  214. yaml_orig, lines = _split_yaml(lines)
  215. # Front-matter YAML
  216. yaml_fm = []
  217. yaml_fm += ['---']
  218. # In case pre-existing links are sanitized
  219. sanitized = url_page.lower().replace('_', '-')
  220. if sanitized != url_page:
  221. if case_check and url_page.lower() == sanitized:
  222. raise RuntimeError(
  223. 'Redirect {} clashes with page {} for local build on '
  224. 'case-insensitive FS\n'.format(sanitized, url_page) +
  225. 'Rename source page to lower case or build on a case '
  226. 'sensitive FS, e.g. case-sensitive disk image on Mac')
  227. yaml_fm += ['redirect_from:']
  228. yaml_fm += [' - "{}"'.format(sanitized)]
  229. if ix_file == 0:
  230. if not sanitized != url_page:
  231. yaml_fm += ['redirect_from:']
  232. yaml_fm += [' - "/"']
  233. if path_url_page.endswith('.ipynb'):
  234. interact_path = 'content/' + path_url_page.split('content/')[-1]
  235. yaml_fm += ['interact_link: {}'.format(interact_path)]
  236. yaml_fm += ["title: '{}'".format(title)]
  237. yaml_fm += ['prev_page:']
  238. yaml_fm += [' url: {}'.format(url_prev_page)]
  239. yaml_fm += [" title: '{}'".format(prev_file_title)]
  240. yaml_fm += ['next_page:']
  241. yaml_fm += [' url: {}'.format(url_next_page)]
  242. yaml_fm += [" title: '{}'".format(next_file_title)]
  243. # Add back any original YaML, and end markers
  244. yaml_fm += yaml_orig
  245. yaml_fm += ['comment: "***PROGRAMMATICALLY GENERATED, DO NOT EDIT. SEE ORIGINAL FILES IN /{}***"'.format(CONTENT_FOLDER_NAME)]
  246. yaml_fm += ['---']
  247. yaml_fm = [ii + '\n' for ii in yaml_fm]
  248. lines = yaml_fm + lines
  249. # Write the result
  250. with open(path_new_file, 'w') as ff:
  251. ff.writelines(lines)
  252. n_built_files += 1
  253. ###############################################################################
  254. # Finishing up...
  255. # Copy non-markdown files in notebooks/ in case they're referenced in the notebooks
  256. print('Copying non-content files inside `{}/`...'.format(CONTENT_FOLDER_NAME))
  257. _copy_non_content_files()
  258. # Message at the end
  259. print("\n===========")
  260. print("Generated {} new files\nSkipped {} already-built files".format(n_built_files, n_skipped_files))
  261. if n_built_files == 0:
  262. print("Delete the markdown files in '{}' for any pages that you wish to re-build, or use --overwrite option to re-build all.".format(BUILD_FOLDER_NAME))
  263. print("\nYour Jupyter Book is now in `{}/`.".format(BUILD_FOLDER_NAME))
  264. print("\nDemo your Jupyter book with `make serve` or push to GitHub!")
  265. print('===========\n')