Source code for darshan.tests.test_lib_accum

import darshan
from darshan.backend.cffi_backend import accumulate_records
from darshan.lib.accum import log_file_count_summary_table, log_module_overview_table
from darshan.log_utils import get_log_path

import pytest
import pandas as pd
from pandas.testing import assert_frame_equal

[docs] @pytest.mark.parametrize("log_name, mod_name, expected", [ # we try to match the "File Count Summary" # tables from the old Perl reports, but # expected values for file counts # are from darshan-parser --file # because of issues like gh-867 # this also means that the average size # column on the old Perl reports cannot always # be relied upon, since that is calculated # using the file counts; furthermore, # total_max_offset_bytes is not printed by # darshan-parser --file, so the avg size column # is not checked quite as robustly as file count # and max size, though in cases where the Perl # report happens to match the file count, it does # seem to match # futhermore, the old Perl report doesn't print out # the file count summary table for all modules, for # example often only showing for POSIX, so in those # cases we really just verify the file count and # the other columns are regression guards against # what we currently have (max size may be available # in a subset of these cases as well) ("e3sm_io_heatmap_only.darshan", "POSIX", # <file count> <avg size> <max size> [[3, "99.74 GiB", "297.71 GiB"], [1, "11.18 MiB", "11.18 MiB"], [2, "149.60 GiB", "297.71 GiB"], [0, "0", "0"]], ), ("e3sm_io_heatmap_only.darshan", "MPI-IO", [[3, "0", "0"], [1, "0", "0"], [2, "0", "0"], [0, "0", "0"]], ), ("e3sm_io_heatmap_only.darshan", "STDIO", [[1, "5.80 KiB", "5.80 KiB"], [0, "0", "0"], [1, "5.80 KiB", "5.80 KiB"], [0, "0", "0"]], ), # the Perl report only gets a very # small fraction of these values correct; # rely on the parser a bit more here; perhaps # because of partial data, etc. ("imbalanced-io.darshan", "POSIX", [[1026, "73.96 MiB", "49.30 GiB"], [12, "67.73 MiB", "549.32 MiB"], [2, "12.00 GiB", "22.63 GiB"], [1, "49.30 GiB", "49.30 GiB"]], ), ("imbalanced-io.darshan", "MPI-IO", [[3, "0", "0"], [0, "0", "0"], [2, "0", "0"], [1, "0", "0"]], ), ("imbalanced-io.darshan", "STDIO", [[12, "93.12 KiB", "964.00 KiB"], [1, "1.81 KiB", "1.81 KiB"], [10, "111.56 KiB", "964.00 KiB"], [0, "0", "0"]], ), ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan", "POSIX", [[100, "1.84 GiB", "100.00 GiB"], [73, "514.56 MiB", "13.84 GiB"], [19, "66.86 MiB", "1.23 GiB"], [8, "18.30 GiB", "100.00 GiB"]], ), ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan", "MPI-IO", [[59, "0", "0"], [50, "0", "0"], [9, "0", "0"], [0, "0", "0"]], ), ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan", "STDIO", [[16, "81.21 KiB", "524.37 KiB"], [9, "4 Bytes", "4 Bytes"], [7, "185.62 KiB", "524.37 KiB"], [0, "0", "0"]], ), ]) def test_file_count_summary_table(log_name, mod_name, expected): expected_df = pd.DataFrame(expected) expected_df.columns = ["number of files", "avg. size", "max size"] # the team decided that we should exclude # "created" files row from the old report because # we can't really determine it reliably expected_df.index = ["total files", "read-only files", "write-only files", "read/write files"] expected_df.index.rename('type', inplace=True) log_path = get_log_path(log_name) with darshan.DarshanReport(log_path, read_all=True) as report: rec_dict = report.records[mod_name].to_df() nprocs = report.metadata['job']['nprocs'] derived_metrics = accumulate_records(rec_dict, mod_name, nprocs).derived_metrics actual_df = log_file_count_summary_table(derived_metrics=derived_metrics, mod_name=mod_name).df assert_frame_equal(actual_df, expected_df)
[docs] @pytest.mark.parametrize("log_path, mod_name, expected", [ ("imbalanced-io.darshan", "STDIO", # <files accessed> <bytes read> <bytes written> <I/O performance estimate> [["12", "1.81 KiB", "1.09 MiB", "0.01 MiB/s (average)"]]), ("imbalanced-io.darshan", "MPI-IO", [["3", "49.30 GiB", "74.06 GiB", "101.58 MiB/s (average)"]]), # imbalanced-io.darshan does have LUSTRE data, # but it doesn't support derived metrics at time # of writing ("imbalanced-io.darshan", "LUSTRE", "RuntimeError"), # APMPI doesn't support derived metrics either ("e3sm_io_heatmap_only.darshan", "APMPI", "RuntimeError"), ("imbalanced-io.darshan", "POSIX", [["1026", "50.10 GiB", "49.30 GiB", "164.99 MiB/s (average)"]]), ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan", "STDIO", [["1", "0 Bytes", "151 Bytes", "4.22 MiB/s (average)"]]), ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan", "POSIX", [["32", "0 Bytes", "32 Bytes", "0.02 MiB/s (average)"]]), ("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan", "STDIO", [["1", "0 Bytes", "1.59 KiB", "16.47 MiB/s (average)"]]), ("e3sm_io_heatmap_only.darshan", "STDIO", [["1", "0 Bytes", "5.80 KiB", "3.26 MiB/s (average)"]]), ("e3sm_io_heatmap_only.darshan", "MPI-IO", [["3", "24.53 MiB", "72.12 GiB", "105.69 MiB/s (average)"]]), ("partial_data_stdio.darshan", "MPI-IO", [["1", "16.00 MiB", "16.00 MiB", "2317.98 MiB/s (average)"]]), ("partial_data_stdio.darshan", "STDIO", [["1022", "0 Bytes", "15.95 GiB", "2999.14 MiB/s (average)"]]), # the C derived metrics code can't distinguish # between different kinds of errors at this time, # but we can still intercept in some cases... ("partial_data_stdio.darshan", "GARBAGE", "ValueError"), ("skew-app.darshan", "POSIX", [["1", "0 Bytes", "40.64 GiB", "157.49 MiB/s (average)"]]), ("skew-app.darshan", "MPI-IO", [["1", "0 Bytes", "40.64 GiB", "55.22 MiB/s (average)"]]), ]) def test_module_overview_table(log_path, mod_name, expected): # test the basic scenario of retrieving # an overview table for a given module log_path = get_log_path(log_path) with darshan.DarshanReport(log_path, read_all=False) as report: if expected == "ValueError": with pytest.raises(ValueError, match=f"mod {mod_name} is not available"): report.mod_read_all_records(mod_name) else: nprocs = report.metadata['job']['nprocs'] if expected == "RuntimeError": # rec_dict not needed to raise this error rec_dict = {} with pytest.raises(RuntimeError, match=f"{mod_name} module does not support derived"): accumulate_records(rec_dict, mod_name, nprocs) else: report.mod_read_all_records(mod_name) rec_dict = report.records[mod_name].to_df() derived_metrics = accumulate_records( rec_dict, mod_name, nprocs).derived_metrics actual_df = log_module_overview_table( derived_metrics=derived_metrics, mod_name=mod_name).df # transpose expected series to get a column of expected data expected_df = pd.DataFrame(expected).T expected_df.index = ["files accessed", "bytes read", "bytes written", "I/O performance estimate"] assert_frame_equal(actual_df, expected_df)