# Source code for darshan.tests.test_lib_accum

import darshan
from darshan.backend.cffi_backend import accumulate_records
from darshan.lib.accum import log_file_count_summary_table, log_module_overview_table
from darshan.log_utils import get_log_path

import pytest
import pandas as pd
from pandas.testing import assert_frame_equal


# [docs]  (documentation-viewer link marker; not executable code)
@pytest.mark.parametrize("log_name, mod_name, expected", [
    # The goal is to reproduce the "File Count Summary" tables
    # of the old Perl reports. The expected file counts,
    # however, come from darshan-parser --file because of
    # issues like gh-867.
    #
    # A consequence is that the average size column of the
    # old Perl reports is not always trustworthy, since it is
    # computed from the file counts. Furthermore,
    # total_max_offset_bytes is not printed by
    # darshan-parser --file, so the avg size column is not
    # checked quite as robustly as file count and max size;
    # in cases where the Perl report happens to agree on the
    # file count, the avg size does seem to match as well.
    #
    # Finally, the old Perl report does not print the file
    # count summary table for every module (often only for
    # POSIX). In those cases only the file count is really
    # verified; the remaining columns act as regression guards
    # against what we currently produce (max size may be
    # available for a subset of these cases too).
    ("e3sm_io_heatmap_only.darshan",
     "POSIX",
     # columns: <file count> <avg size> <max size>
     [[3, "99.74 GiB", "297.71 GiB"],
      [1, "11.18 MiB", "11.18 MiB"],
      [2, "149.60 GiB", "297.71 GiB"],
      [0, "0", "0"]],
    ),
    ("e3sm_io_heatmap_only.darshan",
     "MPI-IO",
     [[3, "0", "0"],
      [1, "0", "0"],
      [2, "0", "0"],
      [0, "0", "0"]],
    ),
    ("e3sm_io_heatmap_only.darshan",
     "STDIO",
     [[1, "5.80 KiB", "5.80 KiB"],
      [0, "0", "0"],
      [1, "5.80 KiB", "5.80 KiB"],
      [0, "0", "0"]],
    ),
    # For this log the Perl report gets only a very small
    # fraction of the values right (perhaps because of
    # partial data, etc.), so lean on the parser a bit more.
    ("imbalanced-io.darshan",
     "POSIX",
     [[1026, "73.96 MiB", "49.30 GiB"],
      [12, "67.73 MiB", "549.32 MiB"],
      [2, "12.00 GiB", "22.63 GiB"],
      [1, "49.30 GiB", "49.30 GiB"]],
    ),
    ("imbalanced-io.darshan",
     "MPI-IO",
     [[3, "0", "0"],
      [0, "0", "0"],
      [2, "0", "0"],
      [1, "0", "0"]],
    ),
    ("imbalanced-io.darshan",
     "STDIO",
     [[12, "93.12 KiB", "964.00 KiB"],
      [1, "1.81 KiB", "1.81 KiB"],
      [10, "111.56 KiB", "964.00 KiB"],
      [0, "0", "0"]],
    ),
    ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan",
     "POSIX",
     [[100, "1.84 GiB", "100.00 GiB"],
      [73, "514.56 MiB", "13.84 GiB"],
      [19, "66.86 MiB", "1.23 GiB"],
      [8, "18.30 GiB", "100.00 GiB"]],
    ),
    ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan",
     "MPI-IO",
     [[59, "0", "0"],
      [50, "0", "0"],
      [9, "0", "0"],
      [0, "0", "0"]],
    ),
    ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan",
     "STDIO",
     [[16, "81.21 KiB", "524.37 KiB"],
      [9, "4 Bytes", "4 Bytes"],
      [7, "185.62 KiB", "524.37 KiB"],
      [0, "0", "0"]],
    ),
])
def test_file_count_summary_table(log_name,
                                  mod_name,
                                  expected):
    """Compare the file count summary table of a module with known values.

    ``expected`` holds one ``[file count, avg size, max size]`` row per
    file category, in the order of the row labels defined below.
    """
    # The team decided to leave out the "created" files row of
    # the old report, because it cannot be determined reliably.
    row_labels = pd.Index(["total files",
                           "read-only files",
                           "write-only files",
                           "read/write files"],
                          name="type")
    expected_df = pd.DataFrame(expected,
                               index=row_labels,
                               columns=["number of files",
                                        "avg. size",
                                        "max size"])

    with darshan.DarshanReport(get_log_path(log_name), read_all=True) as report:
        mod_records = report.records[mod_name].to_df()
        nprocs = report.metadata['job']['nprocs']

    # accumulation happens after the report has been closed,
    # using only the data extracted above
    accumulated = accumulate_records(mod_records, mod_name, nprocs)
    summary_table = log_file_count_summary_table(
        derived_metrics=accumulated.derived_metrics,
        mod_name=mod_name)

    assert_frame_equal(summary_table.df, expected_df)




# [docs]  (documentation-viewer link marker; not executable code)
@pytest.mark.parametrize("log_path, mod_name, expected", [
    ("imbalanced-io.darshan",
     "STDIO",
     # columns: <files accessed> <bytes read> <bytes written> <I/O performance estimate>
     [["12", "1.81 KiB", "1.09 MiB", "0.01 MiB/s (average)"]]),
    ("imbalanced-io.darshan",
     "MPI-IO",
     [["3", "49.30 GiB", "74.06 GiB", "101.58 MiB/s (average)"]]),
    # imbalanced-io.darshan does contain LUSTRE data, but that
    # module does not support derived metrics at the time of
    # writing
    ("imbalanced-io.darshan",
     "LUSTRE",
     "RuntimeError"),
    # derived metrics are not supported for APMPI either
    ("e3sm_io_heatmap_only.darshan",
     "APMPI",
     "RuntimeError"),
    ("imbalanced-io.darshan",
     "POSIX",
     [["1026", "50.10 GiB", "49.30 GiB", "164.99 MiB/s (average)"]]),
    ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
     "STDIO",
     [["1", "0 Bytes", "151 Bytes", "4.22 MiB/s (average)"]]),
    ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
     "POSIX",
     [["32", "0 Bytes", "32 Bytes", "0.02 MiB/s (average)"]]),
    ("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
     "STDIO",
     [["1", "0 Bytes", "1.59 KiB", "16.47 MiB/s (average)"]]),
    ("e3sm_io_heatmap_only.darshan",
     "STDIO",
     [["1", "0 Bytes", "5.80 KiB", "3.26 MiB/s (average)"]]),
    ("e3sm_io_heatmap_only.darshan",
     "MPI-IO",
     [["3", "24.53 MiB", "72.12 GiB", "105.69 MiB/s (average)"]]),
    ("partial_data_stdio.darshan",
     "MPI-IO",
     [["1", "16.00 MiB", "16.00 MiB", "2317.98 MiB/s (average)"]]),
    ("partial_data_stdio.darshan",
     "STDIO",
     [["1022", "0 Bytes", "15.95 GiB", "2999.14 MiB/s (average)"]]),
    # at this time the C derived metrics code cannot tell
    # different kinds of errors apart, but some cases can
    # still be intercepted on the Python side...
    ("partial_data_stdio.darshan",
     "GARBAGE",
     "ValueError"),
    ("skew-app.darshan",
     "POSIX",
     [["1", "0 Bytes", "40.64 GiB", "157.49 MiB/s (average)"]]),
    ("skew-app.darshan",
     "MPI-IO",
     [["1", "0 Bytes", "40.64 GiB", "55.22 MiB/s (average)"]]),
])
def test_module_overview_table(log_path, mod_name, expected):
    """Check the overview table retrieved for a given log file and module.

    ``expected`` is either a single-row list with the table values, or
    one of the strings ``"ValueError"`` / ``"RuntimeError"`` naming the
    exception the test expects to be raised instead.
    """
    with darshan.DarshanReport(get_log_path(log_path), read_all=False) as report:
        # case 1: reading the module's records is expected to fail
        if expected == "ValueError":
            with pytest.raises(ValueError,
                               match=f"mod {mod_name} is not available"):
                report.mod_read_all_records(mod_name)
            return

        nprocs = report.metadata['job']['nprocs']

        # case 2: accumulation is expected to fail; an empty
        # record container is passed since no records are
        # needed to raise this error
        if expected == "RuntimeError":
            with pytest.raises(RuntimeError,
                               match=f"{mod_name} module does not support derived"):
                accumulate_records({}, mod_name, nprocs)
            return

        # case 3: the basic scenario -- build the table and compare
        report.mod_read_all_records(mod_name)
        mod_records = report.records[mod_name].to_df()

        accumulated = accumulate_records(mod_records, mod_name, nprocs)
        overview_table = log_module_overview_table(
            derived_metrics=accumulated.derived_metrics,
            mod_name=mod_name)

        # the expected values are given as one row; transpose
        # them into a single column of expected data
        expected_df = pd.DataFrame(expected).transpose()
        expected_df.index = ["files accessed",
                             "bytes read",
                             "bytes written",
                             "I/O performance estimate"]

        assert_frame_equal(overview_table.df, expected_df)