Skip to content
Snippets Groups Projects
Select Git revision
  • e2b8a816bdb633f5edb133065dca1c2ba057e097
  • 2023 default
  • pages protected
  • 2022-matse
  • 2022
  • 2021
  • 2019
  • master
8 results

Introduction-to-Pandas--solution.ipynb

Blame
  • SanityCheck-Tutorials.ipynb 26.39 KiB

    Sanity Check

    This notebook checks if the available general tutorials run without errors.

    This particularly tests the installed Python packages and their interoperability for standard features and examples. Running these checks can take a lot of time and therefore will not finish in the max. CPU time we provide for a process on a login node. Hence the Jupyter kernel for this notebook will eventually be killed by the system. These Sanity Checks are primarily useful for system administrators.

    If you want to run them anyway - ensure your Jupyter is running as a batch job with far more compute time available.


    Set Path to Tutorials

    import os
    
    # alternative 1: create new directory and download
    from git import Repo
    # Base directory for the checkout: $PROJECT (fallback: $HOME) plus the user name.
    dwnldir = os.path.join(os.getenv('PROJECT', os.getenv('HOME', '.')), os.getenv('USER', ''))
    # GitPython's Repo.clone_from() places the repository *content* directly in
    # the given path, so the clone target must already include the repository
    # name - otherwise the 'workdir' used by the cells below would not exist.
    workdir = os.path.join(dwnldir, 'j4j_notebooks')
    repo = Repo.clone_from("https://gitlab.version.fz-juelich.de/jupyter4jsc/j4j_notebooks.git", workdir)
    
    # alternative 2: use existing download
    #workdir = '/p/project/ccstvs/' + os.getenv('USER', '')  + '/j4j_notebooks'
    
    print(workdir)

    Define Functions for the later Sanity Check

    from typing import Any, Tuple, Dict, Mapping
    from collections import defaultdict
    from nbformat import NotebookNode
    
    def count_source(source: str) -> Tuple[int, int, int]:
        """Count number of non-blank lines, words, and non-whitespace characters.
    
        :param source: string to count
        :return: number of non-blank lines, words, and non-whitespace characters
        """
        non_blank = [ln for ln in source.split('\n') if ln.strip()]
        tokens = source.split()
        # total non-whitespace characters == sum of the token lengths
        return len(non_blank), len(tokens), sum(len(tok) for tok in tokens)
    
    # Top-level fields every notebook document must carry (checked elsewhere).
    REQUIRED_NB_FIELDS = {"metadata", "nbformat_minor", "nbformat", "cells"}
    REQUIRED_NB_METADATA_FIELDS = {"kernelspec", "language_info"}
    # The three cell types defined by the nbformat v4 schema.
    CELL_TYPES = ('markdown', 'code', 'raw', )
    # Fields each cell type must have, per nbformat v4.
    REQUIRED_CELL_FIELDS = {
        'markdown': {"cell_type", "metadata", "source"},
        'code': {"cell_type", "metadata", "source", "execution_count", "outputs"},
        'raw': {"cell_type", "metadata", "source"}
    }
    # Fields each cell type may additionally have; anything outside the union
    # of required and optional fields is counted as "extra" by nb_cell_stats.
    OPTIONAL_CELL_FIELDS = {
        'markdown': {"attachments"},
        'code': set(),
        'raw': {"attachments"}
    }
    # For each code-cell output_type, the fields that output may carry.
    OPTIONAL_OUTPUT_TYPES = {
        'execute_result': {'data', 'metadata' ,'execution_count'},
        'stream': {'name', 'text'},
        'display_data': {'data', 'metadata', },
        'error': {'ename', 'evalue', 'traceback'},
    }
    
    # Section keys of the result dictionary returned by nb_cell_stats().
    CELL_STATISTICS = (
        'cell_types',  #: cell type counts
        'sources', #: cell sources counts
        'cell_metadata',  #: cell metadata counts, including separate ``tags``
        'cell_attachments',  #: cell attachment MIME type counts, and total
        'code_execution',  #: code cell execution count statistics
        'code_outputs',  #: code cell counts per output_type, subcounts per ``stream`` and ``error``, and total
        'cell_extra',  #: counts for extra (unknown) fields in cells
    )
    
    # dictionary keys for source statistics
    EMPTY_SOURCES = 'total empty sources'
    SOURCE_LINES = 'total source lines'
    SOURCE_WORDS = 'total source words'
    SOURCE_CHARS = 'total source chars'
    EMPTY_SOURCES_MD = 'markdown empty sources'
    SOURCE_LINES_MD = 'markdown source lines'
    SOURCE_WORDS_MD = 'markdown source words'
    SOURCE_CHARS_MD = 'markdown source chars'
    EMPTY_SOURCES_CODE = 'code empty sources'
    SOURCE_LINES_CODE = 'code source lines'
    SOURCE_WORDS_CODE = 'code source words'
    SOURCE_CHARS_CODE = 'code source chars'
    EMPTY_SOURCES_RAW = 'raw empty sources'
    SOURCE_LINES_RAW = 'raw source lines'
    SOURCE_WORDS_RAW = 'raw source words'
    SOURCE_CHARS_RAW = 'raw source chars'
    
    def nb_cell_stats(nb: NotebookNode) -> Dict[str, Dict[str, int]]:
        """Count occurrences of various elements in notebook cells.
    
        :param nb: notebook to inspect
        :return: dictionary of dictionaries with counts per section;
            each section has its own key; see CELL_STATISTICS
        """
        stats = {section: defaultdict(int) for section in CELL_STATISTICS}
    
        # per cell type: (empty-source key, line key, word key, char key)
        per_type = {
            'markdown': (EMPTY_SOURCES_MD, SOURCE_LINES_MD, SOURCE_WORDS_MD, SOURCE_CHARS_MD),
            'code': (EMPTY_SOURCES_CODE, SOURCE_LINES_CODE, SOURCE_WORDS_CODE, SOURCE_CHARS_CODE),
            'raw': (EMPTY_SOURCES_RAW, SOURCE_LINES_RAW, SOURCE_WORDS_RAW, SOURCE_CHARS_RAW),
        }
    
        for cell in nb.cells:
            stats['cell_types']['total cell count'] += 1
            ct = cell.cell_type
            stats['cell_types'][ct] += 1
    
            # source statistics: overall totals plus a per-cell-type breakdown
            lines, words, chars = count_source(cell.source)
            typed_keys = per_type.get(ct)
            if chars == 0:
                # no non-whitespace chars implies no words and no counted lines
                stats['sources'][EMPTY_SOURCES] += 1
                if typed_keys is not None:
                    stats['sources'][typed_keys[0]] += 1
            else:
                for key, amount in ((SOURCE_LINES, lines),
                                    (SOURCE_WORDS, words),
                                    (SOURCE_CHARS, chars)):
                    stats['sources'][key] += amount
                if typed_keys is not None:
                    for key, amount in zip(typed_keys[1:], (lines, words, chars)):
                        stats['sources'][key] += amount
    
            # count every metadata key, and each tag of the 'tags' metadata
            for attr in cell.metadata:
                stats['cell_metadata'][attr] += 1
            for tag in cell.metadata.get('tags', ()):
                stats['cell_metadata']['tag ' + tag] += 1
    
            # count attachment MIME types (plus an overall total)
            if 'attachments' in cell:
                stats['cell_attachments']['total count of cells with attachments'] += 1
                for attachment in cell.attachments.values():
                    for mime in attachment:
                        stats['cell_attachments']['total attachments count'] += 1
                        stats['cell_attachments'][mime] += 1
    
            # count fields not defined by the nbformat schema for this cell type
            known_fields = REQUIRED_CELL_FIELDS[ct] | OPTIONAL_CELL_FIELDS[ct]
            for field in cell:
                if field not in known_fields:
                    stats['cell_extra'][field] += 1
    
        return stats
    
    from colorama import Fore, Back, Style
    DEFAULT_WIDTH = 10
    def print_dict(d: Dict[str, Any], header: str=None, width: int=DEFAULT_WIDTH) -> None:
        """Print dictionary d as a small table with an optional section header.
    
        Keys are printed in sorted order; the entry for the key 'raw' is
        highlighted in red so raw cells stand out.
    
        :param d: dictionary to print
        :param header: header of the table
        :param width: width of the left column
        """
        if not d:
            return  # nothing to show for an empty dictionary
        if header:
            print('{}:'.format(header))
        for key in sorted(d):
            style = Fore.RED if key == 'raw' else ''
            value = str(d[key])
            print(style + '  {:>{}} {}'.format(value, width, key) + Style.RESET_ALL)
                
    from pathlib import Path
    from nbformat import NotebookNode
    from typing import List, Union
    import nbformat
    import sys
    
    def read_nb(nb_path: Path) -> Union[None, NotebookNode]:
        """Read a notebook from the given path and return it as nbformat v4.
    
        Any read/parse error is reported on stderr and turned into ``None``.
    
        :param nb_path: path to read from
        :return: notebook read from ``nb_path``, or ``None`` if reading failed
        """
        try:
            # 'with' guarantees the file handle is closed even when parsing
            # fails (the previous version leaked one descriptor per call).
            with nb_path.open(encoding='utf-8') as fp:
                nb = nbformat.read(fp, as_version=4)
        except Exception as e:
            ename = type(e).__name__
            print('Reading of "{}" failed ({}):\n  {}'.format(nb_path.name, ename, e), file=sys.stderr)
            return None
    
        return nb

    Run the Sanity Check

    import os
    import papermill as pm
    from papermill.exceptions import PapermillExecutionError
    
    # Walk the tutorials tree, print statistics for every notebook, execute it
    # with papermill, and collect [path, error] pairs of failing notebooks.
    failed_notebooks = list()
    dirbase = workdir + '/001-Jupyter/001-Tutorials/'
    for dirpath, dirs, files in os.walk(dirbase):
        dirs.sort()  # deterministic traversal order
        if os.path.basename(dirpath).startswith('.'):
            continue  # skip hidden directories (e.g. .ipynb_checkpoints)
        for filename in sorted(files):
            if not filename.endswith('.ipynb') or filename.startswith('papermill_'):
                continue  # only original notebooks, not executed copies
            print(os.path.join(dirpath, filename))
            if filename == "SanityCheck-Tutorials.ipynb":
                continue  # do not run this very notebook recursively
            if filename.endswith('_skip.ipynb'):
                print("... skipping")
                continue
            nb_path = os.path.join(dirpath, filename)
            try:
                # run from the notebook's directory so its relative paths work
                os.chdir(dirpath)
    
                # get notebook statistics
                nb = read_nb(Path(nb_path))
                cell_stats = nb_cell_stats(nb)
                print_dict(cell_stats['cell_types'], "Cell types")
                print_dict(cell_stats['sources'], "Cell sources")
                print_dict(cell_stats['cell_metadata'], "Cell metadata fields")
                print_dict(cell_stats['cell_attachments'], "Cell attachments")
    
                # execute the notebook; the executed copy goes next to the original
                try:
                    nb = pm.execute_notebook(
                        nb_path,
                        os.path.join(dirpath, 'papermill_' + filename),
                        #kernel_name="Python3"
                    )
                except PapermillExecutionError as e:
                    # Record the failure here: the previous bare 'except:'
                    # swallowed this exception before the outer handler could
                    # see it, so failed_notebooks was never populated.
                    failed_notebooks.append([nb_path, e.evalue])
                    print("FAILED !!!!")
                    print(e.evalue)
                except Exception as e:
                    # other failures (missing kernel, unreadable notebook, ...)
                    failed_notebooks.append([nb_path, str(e)])
                    print("FAILED !!!!")
            finally:
                # always return to the base directory, even after an error
                os.chdir(dirbase)

    Check Results

    failed_notebooks

    Clean Up

    import os
    
    # Delete every executed copy that papermill produced (files prefixed with
    # 'papermill_'), leaving the original notebooks untouched.
    # Note: the unused papermill imports and the 'failed_notebooks = list()'
    # reset were removed - they were copy-paste leftovers from the run cell,
    # and the reset clobbered the result list shown under "Check Results".
    dirbase = workdir + '/001-Jupyter/001-Tutorials/'
    for dirpath, dirs, files in os.walk(dirbase):
        if os.path.basename(dirpath).startswith('.'):
            continue  # skip hidden directories (e.g. .ipynb_checkpoints)
        for filename in files:
            if filename.endswith('.ipynb') and filename.startswith('papermill_'):
                nb_path = os.path.join(dirpath, filename)
                print(nb_path)
                os.remove(nb_path)

    Remove Output

    # NOTE(review): this cell uses the IPython '!' shell escape below, so it
    # only runs inside Jupyter/IPython, not as plain Python.
    import os
    import papermill as pm
    from papermill.exceptions import PapermillExecutionError
    
    # NOTE(review): 'pm', 'PapermillExecutionError' and 'failed_notebooks' are
    # unused in this cell - apparently copy-paste leftovers from the run cell.
    failed_notebooks = list()
    dirbase = workdir + '/001-Jupyter/001-Tutorials/'
    # Strip the stored outputs from every original (non-papermill) notebook.
    for dirpath, dirs, files in os.walk(dirbase):
        if os.path.basename(dirpath).startswith('.'):
            continue
        for filename in files:
            if filename.endswith('.ipynb') and not filename.startswith('papermill_'):
                nb_path = os.path.join(dirpath,filename)
                print(nb_path)            
                !jupyter nbconvert --clear-output --inplace "{nb_path}"