import os
import numpy as np
from numpy.testing import assert_allclose
import pytest
import pandas as pd
from pandas.testing import assert_series_equal
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import darshan
from darshan.experimental.plots import data_access_by_filesystem
from darshan.log_utils import get_log_path
[docs]
@pytest.mark.parametrize("series, expected_series", [
    # only '/yellow' is present: the two absent roots
    # should be added in with values of 0
    (
        pd.Series([1], index=['/yellow']),
        pd.Series([1, 0, 0], index=['/yellow', '/tmp', '/home'], dtype=np.float64),
    ),
    # two roots are present: the single absent root
    # should be added in with a value of 0
    (
        pd.Series([1, 3], index=['/yellow', '/tmp']),
        pd.Series([1, 3, 0], index=['/yellow', '/tmp', '/home'], dtype=np.float64),
    ),
    # every root is already accounted for, so the values
    # are expected to come back unchanged
    (
        pd.Series([1, 3, 2], index=['/yellow', '/tmp', '/home']),
        pd.Series([1, 3, 2], index=['/yellow', '/tmp', '/home'], dtype=np.float64),
    ),
    # only the final root is present: the index order should
    # follow the provided filesystem_roots, with 0 values
    # filled in where needed
    (
        pd.Series([2], index=['/home']),
        pd.Series([0, 0, 2], index=['/yellow', '/tmp', '/home'], dtype=np.float64),
    ),
])
def test_empty_series_handler(series, expected_series):
    """Check that empty_series_handler() fills in missing filesystems.

    Any filesystem root absent from the input Series is expected to
    appear in the output with a value of 0 (i.e., no activity for that
    filesystem); this is mostly to enforce consistent plotting behavior.
    """
    roots = ['/yellow', '/tmp', '/home']
    result = data_access_by_filesystem.empty_series_handler(
        series=series,
        filesystem_roots=roots,
    )
    assert_series_equal(result, expected_series)
[docs]
@pytest.mark.parametrize("file_path, expected_root_path", [
    (
        "/scratch1/scratchdirs/glock/testFile.00000046",
        "/scratch1",
    ),
])
def test_convert_file_path_to_root_path(file_path, expected_root_path):
    """Check the root path that convert_file_path_to_root_path() returns for a file path."""
    root_path = data_access_by_filesystem.convert_file_path_to_root_path(
        file_path=file_path,
    )
    assert root_path == expected_root_path
[docs]
@pytest.mark.parametrize("input_id, file_id_dict, expected_file_path", [
    # an ID that is present in the dictionary
    (
        9.457796068806373e+18,
        {
            210703578647777632: '/yellow/usr/projects/eap/users/treddy/simple_dxt_mpi_io_darshan/test.out.locktest.0',
            9457796068806373448: '/yellow/usr/projects/eap/users/treddy/simple_dxt_mpi_io_darshan/test.out',
        },
        '/yellow/usr/projects/eap/users/treddy/simple_dxt_mpi_io_darshan/test.out',
    ),
    # intentionally use an ID that is absent
    # in the dictionary
    (
        9.357796068806371e+18,
        {
            210703578647777632: '/yellow/usr/projects/eap/users/treddy/simple_dxt_mpi_io_darshan/test.out.locktest.0',
            9457796068806373448: '/yellow/usr/projects/eap/users/treddy/simple_dxt_mpi_io_darshan/test.out',
        },
        None,
    ),
])
def test_convert_file_id_to_path(input_id, file_id_dict, expected_file_path):
    """Check the ID -> path lookup, including an ID missing from the dictionary.

    The dictionary is first converted with convert_id_dict_to_arrays(), and
    the resulting pair of arrays is handed to convert_file_id_to_path().
    """
    hashes, paths = data_access_by_filesystem.convert_id_dict_to_arrays(
        file_id_dict=file_id_dict,
    )
    found_path = data_access_by_filesystem.convert_file_id_to_path(
        input_id=input_id,
        file_hashes=hashes,
        file_paths=paths,
    )
    assert found_path == expected_file_path
[docs]
@pytest.mark.parametrize("verbose", [True, False])
@pytest.mark.parametrize("file_id_dict, expected_root_paths", [
    (
        {
            210703578647777632: '/yellow/usr/projects/eap/users/treddy/simple_dxt_mpi_io_darshan/test.out.locktest.0',
            14388265063268455899: '/tmp/ompi.sn176.28751/jf.29186/1/test.out_cid-0-3400.sm',
        },
        ['/yellow', '/tmp'],
    ),
])
def test_identify_filesystems(capsys, file_id_dict, expected_root_paths, verbose):
    """Check the root paths returned by identify_filesystems() and its stdout.

    With ``verbose`` enabled every returned root path should also show up
    in the captured standard output; otherwise nothing should be printed.
    """
    root_paths = data_access_by_filesystem.identify_filesystems(
        file_id_dict=file_id_dict,
        verbose=verbose,
    )
    assert root_paths == expected_root_paths

    printed = capsys.readouterr().out
    if not verbose:
        # nothing should be printed
        assert len(printed) == 0
        return
    # the same root paths should also be printed
    for path in root_paths:
        assert path in printed
[docs]
@pytest.mark.parametrize("""log_path,
expected_df_reads_shape,
expected_df_writes_shape""", [
    (
        get_log_path("sample.darshan"),
        (0, 87),
        (3, 87),
    ),
    (
        get_log_path("sample-dxt-simple.darshan"),
        (0, 73),
        (2, 73),
    ),
])
def test_rec_to_rw_counter_dfs_with_cols(log_path,
                                         expected_df_reads_shape,
                                         expected_df_writes_shape):
    """Check basic shape expectations on the read/write dataframes.

    The dataframes are produced by rec_to_rw_counter_dfs_with_cols() for
    the POSIX module of each sample log.
    """
    with darshan.DarshanReport(log_path) as report:
        name_records = report.data["name_records"]
        reads_df, writes_df = data_access_by_filesystem.rec_to_rw_counter_dfs_with_cols(
            report=report,
            file_id_dict=name_records,
            mod='POSIX',
        )
    assert reads_df.shape == expected_df_reads_shape
    assert writes_df.shape == expected_df_writes_shape
[docs]
@pytest.mark.parametrize("read_groups, write_groups, filesystem_roots, expected_read_groups, expected_write_groups", [
    (
        pd.Series([0, 1, 7], index=['/root', '/tmp', '/yellow']),
        pd.Series([5, 5], index=['/root', '/tmp']),
        ['/root', '/tmp', '/yellow', '/usr', '/scratch1'],
        pd.Series([0, 1, 7, 0, 0], index=['/root', '/tmp', '/yellow', '/usr', '/scratch1'], dtype=np.float64),
        pd.Series([5, 5, 0, 0, 0], index=['/root', '/tmp', '/yellow', '/usr', '/scratch1'], dtype=np.float64),
    ),
])
def test_check_empty_series(read_groups,
                            write_groups,
                            filesystem_roots,
                            expected_read_groups,
                            expected_write_groups):
    """Check that check_empty_series() reindexes both Series as expected."""
    reindexed_reads, reindexed_writes = data_access_by_filesystem.check_empty_series(
        read_groups=read_groups,
        write_groups=write_groups,
        filesystem_roots=filesystem_roots,
    )
    assert_series_equal(reindexed_reads, expected_read_groups)
    assert_series_equal(reindexed_writes, expected_write_groups)
[docs]
@pytest.mark.parametrize("df_reads, df_writes, expected_read_groups, expected_write_groups", [
    (
        pd.DataFrame({
            'filesystem_root': ['/yellow', '/tmp', '/yellow'],
            'POSIX_BYTES_READ': [3, 5, 90],
            'POSIX_BYTES_WRITTEN': [0, 9, 0],
            'COLUMN3': [np.nan, 5, 8],
            'COLUMN4': ['a', 'b', 'c'],
        }),
        pd.DataFrame({
            'filesystem_root': ['/yellow', '/tmp', '/tmp'],
            'POSIX_BYTES_READ': [1, 11, 17],
            'POSIX_BYTES_WRITTEN': [2098, 9, 20],
            'COLUMN3': [np.nan, 5, 1],
            'COLUMN4': ['a', 'b', 'd'],
        }),
        pd.Series([5, 93], index=pd.Index(['/tmp', '/yellow'], name="filesystem_root"), name='BYTES_READ'),
        pd.Series([29, 2098], index=pd.Index(['/tmp', '/yellow'], name="filesystem_root"), name='BYTES_WRITTEN'),
    ),
])
def test_process_byte_counts(df_reads, df_writes, expected_read_groups, expected_write_groups):
    """Check the per-filesystem byte totals returned by process_byte_counts()."""
    read_totals, write_totals = data_access_by_filesystem.process_byte_counts(
        df_reads=df_reads,
        df_writes=df_writes,
    )
    assert_series_equal(read_totals, expected_read_groups)
    assert_series_equal(write_totals, expected_write_groups)
[docs]
@pytest.mark.parametrize("df_reads, df_writes, expected_read_groups, expected_write_groups", [
    (
        pd.DataFrame({
            'filesystem_root': ['/yellow', '/tmp', '/yellow'],
            'filepath': ['/yellow/file1', '/tmp/file2', '/yellow/file3'],
            'POSIX_BYTES_READ': [3, 5, 90],
            'POSIX_BYTES_WRITTEN': [0, 9, 0],
            'COLUMN3': [np.nan, 5, 8],
            'COLUMN4': ['a', 'b', 'c'],
        }),
        pd.DataFrame({
            'filesystem_root': ['/yellow', '/tmp', '/tmp'],
            'filepath': ['/yellow/file4', '/tmp/file5', '/tmp/file19'],
            'POSIX_BYTES_READ': [1, 11, 17],
            'POSIX_BYTES_WRITTEN': [2098, 9, 20],
            'COLUMN3': [np.nan, 5, 1],
            'COLUMN4': ['a', 'b', 'd'],
        }),
        pd.Series([1, 2], index=pd.Index(['/tmp', '/yellow'], name="filesystem_root"), name='filepath'),
        pd.Series([2, 1], index=pd.Index(['/tmp', '/yellow'], name="filesystem_root"), name='filepath'),
    ),
])
def test_process_unique_files(df_reads, df_writes, expected_read_groups, expected_write_groups):
    """Check the per-filesystem file counts returned by process_unique_files()."""
    read_counts, write_counts = data_access_by_filesystem.process_unique_files(
        df_reads=df_reads,
        df_writes=df_writes,
    )
    assert_series_equal(read_counts, expected_read_groups)
    assert_series_equal(write_counts, expected_write_groups)
[docs]
@pytest.mark.parametrize("mod", ["POSIX", "OTHER"])
@pytest.mark.parametrize("verbose", [True, False])
@pytest.mark.parametrize("""log_path,
processing_func,
expected_read_groups,
expected_write_groups""", [
    (
        get_log_path("sample.darshan"),
        data_access_by_filesystem.process_unique_files,
        pd.Series([0.0, 0.0, 0.0, 0.0], index=pd.Index(['<STDIN>', '<STDOUT>', '<STDERR>', '/scratch2'], name='filesystem_root'), name='filepath'),
        pd.Series([0.0, 1.0, 1.0, 1.0], index=pd.Index(['<STDIN>', '<STDOUT>', '<STDERR>', '/scratch2'], name='filesystem_root'), name='filepath'),
    ),
])
def test_unique_fs_rw_counter(log_path,
                              processing_func,
                              verbose,
                              expected_read_groups,
                              expected_write_groups,
                              mod):
    """Check unique_fs_rw_counter() for "POSIX" and for the "OTHER" module name.

    For "POSIX" the returned read/write Series are compared against the
    expected values; for "OTHER" a NotImplementedError is expected.
    """
    with darshan.DarshanReport(log_path) as report:
        file_id_dict = report.data["name_records"]
        filesystem_roots = data_access_by_filesystem.identify_filesystems(report.data["name_records"])
        # both branches issue the exact same call
        call_kwargs = dict(
            report=report,
            filesystem_roots=filesystem_roots,
            file_id_dict=file_id_dict,
            processing_func=processing_func,
            mod=mod,
            verbose=verbose,
        )
        if mod != "POSIX":
            with pytest.raises(NotImplementedError):
                data_access_by_filesystem.unique_fs_rw_counter(**call_kwargs)
            return
        read_groups, write_groups = data_access_by_filesystem.unique_fs_rw_counter(**call_kwargs)
        assert_series_equal(read_groups, expected_read_groups)
        assert_series_equal(write_groups, expected_write_groups)
[docs]
@pytest.mark.parametrize("""file_rd_series,
file_wr_series,
bytes_rd_series,
bytes_wr_series,
filesystem_roots
""", [
    (
        pd.Series([3.0], index=pd.Index(['/p'], name='filesystem_root'), name='filepath'),
        pd.Series([14.0], index=pd.Index(['/p'], name='filesystem_root'), name='filepath'),
        pd.Series([2.145206e+09], index=pd.Index(['/p'], name='filesystem_root'), name='POSIX_BYTES_READ'),
        pd.Series([1.010878e+12], index=pd.Index(['/p'], name='filesystem_root'), name='POSIX_BYTES_WRITTEN'),
        ['/p'],
    ),
])
def test_plot_data(file_rd_series, file_wr_series, bytes_rd_series, bytes_wr_series, filesystem_roots):
    """Test a few basic properties of the main plotting function.

    Checks that the expected "files read"/"files written" labels are among
    the text children of the current axes, and that the right-side spine of
    every axes in the figure is invisible.
    """
    fig = plt.figure()
    try:
        data_access_by_filesystem.plot_data(fig=fig,
                                            file_rd_series=file_rd_series,
                                            file_wr_series=file_wr_series,
                                            bytes_rd_series=bytes_rd_series,
                                            bytes_wr_series=bytes_wr_series,
                                            filesystem_roots=filesystem_roots)
        # accumulate text added via ax.text() by the function
        actual_list_text_in_fig = [
            child.get_text()
            for child in fig.gca().get_children()
            if isinstance(child, matplotlib.text.Text)
        ]
        for expected_text in (' files read: 3', ' files written: 14'):
            assert expected_text in actual_list_text_in_fig
        # enforce invisible right-side spine so that
        # there is no overlap between value labels and
        # the plot frame on the right side
        for ax in fig.axes:
            assert not ax.spines['right'].get_visible()
    finally:
        # release the figure so that pyplot does not keep it
        # alive for the remainder of the test session
        plt.close(fig)
[docs]
def test_empty_data_posix_y_axis_annot_position():
    """Regression test for y-axis annotation overlap seen in gh-397.

    The y-axis filesystem annotations were observed to cross the left
    side spine and overlap onto the plot proper when using a log file
    that lacks POSIX data; verify that every annotation in the figure
    uses a font size of 18.
    """
    log_file_path = get_log_path('noposixopens.darshan')
    with darshan.DarshanReport(log_file_path) as report:
        actual_fig = data_access_by_filesystem.plot_with_report(report=report)
    # check the font size of each y annotation
    for ax in actual_fig.axes:
        for child in ax.get_children():
            if isinstance(child, matplotlib.text.Annotation):
                assert child.get_fontsize() == 18
[docs]
@pytest.mark.parametrize("log_file_name, expected_text_labels", [
    ('noposixopens.darshan', ['/global', 'anonymized']),
    ('sample.darshan', ['/scratch2', '<STDERR>', '<STDOUT>']),
    # test case for gh-678
    ('mpi-io-test.darshan', ['/global', '<STDOUT>']),
])
def test_cat_labels_std_streams(log_file_name, expected_text_labels):
    """Check the category labels used as annotations on the y axis.

    For an anonymized log file that operates on STDIO, STDERR and STDIN,
    we want appropriate labels to be used instead of confusing integers
    on the y axis; for the same scenario without anonymization, the
    STD.. stream labels seem appropriate.
    """
    with darshan.DarshanReport(get_log_path(log_file_name)) as report:
        figure = data_access_by_filesystem.plot_with_report(report=report)
    # gather the annotation text in axes order
    found_labels = []
    for ax in figure.axes:
        found_labels.extend(
            child.get_text()
            for child in ax.get_children()
            if isinstance(child, matplotlib.text.Annotation)
        )
    assert found_labels == expected_text_labels
[docs]
def test_empty_data_posix_text_position():
    """Regression test for text label placement seen in gh-397.

    The bytes and files read/written text labels were observed to be
    too far to the right in the subplots for a log file lacking POSIX
    activity; check the position of each such label.
    """
    with darshan.DarshanReport(get_log_path('noposixopens.darshan')) as report:
        figure = data_access_by_filesystem.plot_with_report(report=report)
    for ax in figure.axes:
        for child in ax.get_children():
            if not isinstance(child, matplotlib.text.Text):
                continue
            label = child.get_text()
            # pick the expected coordinate for the kind of label
            if 'read' in label:
                expected_position = (0, 0.75)
            elif 'written' in label:
                expected_position = (0, 0.25)
            else:
                continue
            assert_allclose(child.get_position(), expected_position)
[docs]
@pytest.mark.parametrize("""file_rd_series,
file_wr_series,
bytes_rd_series,
bytes_wr_series,
filesystem_roots
""", [
    (
        pd.Series([1], index=pd.Index(['/p'], name='filesystem_root'), name='filepath'),
        pd.Series([1], index=pd.Index(['/p'], name='filesystem_root'), name='filepath'),
        pd.Series([1.049e+6], index=pd.Index(['/p'], name='filesystem_root'), name='POSIX_BYTES_READ'),
        pd.Series([1.049e+6], index=pd.Index(['/p'], name='filesystem_root'), name='POSIX_BYTES_WRITTEN'),
        ['/p'],
    ),
    # test case where files read/written are zero
    (
        pd.Series([0], index=pd.Index(['/p'], name='filesystem_root'), name='filepath'),
        pd.Series([0], index=pd.Index(['/p'], name='filesystem_root'), name='filepath'),
        # NOTE: very strange to be able to read/write bytes to
        # a filesystem and yet have no files read or written
        # to on that filesystem (this might be an error someday?)
        # see comment:
        # https://github.com/darshan-hpc/darshan/pull/397#discussion_r683621305
        pd.Series([1.049e+6], index=pd.Index(['/p'], name='filesystem_root'), name='POSIX_BYTES_READ'),
        pd.Series([1.049e+6], index=pd.Index(['/p'], name='filesystem_root'), name='POSIX_BYTES_WRITTEN'),
        ['/p'],
    ),
])
def test_plot_data_labels(file_rd_series, file_wr_series, bytes_rd_series, bytes_wr_series, filesystem_roots):
    """Regression test for label spacing in the plot (review comment in gh-397).

    Every text label in the figure, other than '/p' and empty strings,
    must start with exactly one leading space.
    """
    fig = plt.figure()
    try:
        data_access_by_filesystem.plot_data(fig=fig,
                                            file_rd_series=file_rd_series,
                                            file_wr_series=file_wr_series,
                                            bytes_rd_series=bytes_rd_series,
                                            bytes_wr_series=bytes_wr_series,
                                            filesystem_roots=filesystem_roots)
        for ax in fig.axes:
            for child in ax.get_children():
                if not isinstance(child, matplotlib.text.Text):
                    continue
                actual_text = child.get_text()
                if actual_text in ['/p', '']:
                    continue
                # count the leading spaces for each label
                leading_spaces = len(actual_text) - len(actual_text.lstrip(' '))
                # check there is always 1 leading space for each label
                assert leading_spaces == 1
    finally:
        # release the figure so that pyplot does not keep it
        # alive for the remainder of the test session
        plt.close(fig)
[docs]
def test_plot_data_shared_x_axis():
    """Regression test for shared, log scaled x axes within a plot column.

    See:
    https://github.com/darshan-hpc/darshan/pull/397#pullrequestreview-717403104
    https://github.com/darshan-hpc/darshan/pull/397#issuecomment-889504530

    Even-indexed axes are treated as the bytes column and odd-indexed
    axes as the files column; x limits must match within a column, tick
    labels must be empty, and only axes 6 and 7 carry the log-scale label.
    """
    filesystem_roots = ['/usr', '/yellow', '/green', '/global']
    rd_bytes = [1e7, 1e8, 1e9, 1e10]
    wr_bytes = [1e8, 1e9, 1e10, 1e11]
    rd_file_cts = [1e3, 1e4, 1e5, 1e6]
    wr_file_cts = [1e2, 1e3, 1e4, 1e5]
    bytes_rd_series = pd.Series(data=rd_bytes, index=filesystem_roots)
    bytes_wr_series = pd.Series(data=wr_bytes, index=filesystem_roots)
    file_rd_series = pd.Series(data=rd_file_cts, index=filesystem_roots)
    file_wr_series = pd.Series(data=wr_file_cts, index=filesystem_roots)
    fig = plt.figure()
    try:
        data_access_by_filesystem.plot_data(fig,
                                            file_rd_series,
                                            file_wr_series,
                                            bytes_rd_series,
                                            bytes_wr_series,
                                            filesystem_roots)
        # enforce shared log x axes in a given column
        bytes_column_x_axis_limits = []
        files_column_x_axis_limits = []
        for i, ax in enumerate(fig.axes):
            if i % 2 == 0:
                bytes_column_x_axis_limits.append(ax.get_xlim())
            else:
                files_column_x_axis_limits.append(ax.get_xlim())
            # also check for absence of ticklabels
            for label in ax.get_xticklabels(which='both'):
                assert len(label.get_text()) == 0
            for label in ax.get_yticklabels(which='both'):
                assert len(label.get_text()) == 0
        for limits in [bytes_column_x_axis_limits,
                       files_column_x_axis_limits]:
            # matching axes:
            diff = np.diff(limits, axis=0)
            assert_allclose(diff, 0)
        # log scale values:
        assert_allclose(np.array(bytes_column_x_axis_limits)[..., 1], 3.89496945e+11)
        assert_allclose(np.array(files_column_x_axis_limits)[..., 1], 2190302.282682)
        # check for log scaling in both columns
        for i, axis in enumerate(fig.axes):
            if i in [6, 7]:
                assert 'symmetric log scaled' in axis.get_xlabel()
            else:
                assert axis.get_xlabel() == ''
    finally:
        # release the figure so that pyplot does not keep it
        # alive for the remainder of the test session
        plt.close(fig)
[docs]
@pytest.mark.parametrize('filename', ['imbalanced-io.darshan'])
def test_log_scale_display(filename):
    """Plot columns that are log scaled should be labelled appropriately.

    Only the axes at index 8 is expected to carry the log axis label;
    every other axes should have an empty x label.
    """
    with darshan.DarshanReport(get_log_path(filename)) as report:
        figure = data_access_by_filesystem.plot_with_report(report=report)
    for index, axes in enumerate(figure.axes):
        x_label = axes.get_xlabel()
        if index != 8:
            assert x_label == ''
        else:
            assert 'symmetric log scaled' in x_label
[docs]
@pytest.mark.parametrize('filename, expected_dims, num_cats', [
    ('imbalanced-io.darshan', [12, 16], None),
    ('imbalanced-io.darshan', [12, 16], 3),
    ('imbalanced-io.darshan', [12, 16], 1),
    ('snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan', [12, 16], None),
])
def test_vertical_resize(filename, expected_dims, num_cats):
    """Check the size (in inches) of the figure from plot_with_report().

    Intended to ensure that plots are expanded vertically to match the
    number of filesystems plotted.
    """
    with darshan.DarshanReport(get_log_path(filename)) as report:
        figure = data_access_by_filesystem.plot_with_report(
            report=report,
            num_cats=num_cats,
        )
    assert_allclose(figure.get_size_inches(), expected_dims)
[docs]
@pytest.mark.parametrize("logname", [
    "mpi-io-test.darshan",
    "treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
])
def test_annotate_center_align(logname):
    """Every annotation in the figure should be vertically center-aligned.

    For review comment here:
    https://github.com/darshan-hpc/darshan/pull/397#discussion_r690847889
    """
    with darshan.DarshanReport(get_log_path(logname)) as report:
        figure = data_access_by_filesystem.plot_with_report(report=report)
    for ax in figure.axes:
        for child in ax.get_children():
            if not isinstance(child, matplotlib.text.Annotation):
                continue
            assert child.get_verticalalignment() == "center"
[docs]
@pytest.mark.parametrize("logname", [
    "imbalanced-io.darshan",
    "mpi-io-test.darshan",
])
def test_text_center_align(logname):
    """Text labels mentioning "read"/"written" should be vertically center-aligned.

    For review comment here:
    https://github.com/darshan-hpc/darshan/pull/397#discussion_r690755364
    """
    with darshan.DarshanReport(get_log_path(logname)) as report:
        figure = data_access_by_filesystem.plot_with_report(report=report)
    for ax in figure.axes:
        for child in ax.get_children():
            if not isinstance(child, matplotlib.text.Text):
                continue
            label = child.get_text()
            if any(keyword in label for keyword in ("read", "written")):
                assert child.get_verticalalignment() == "center"
[docs]
@pytest.mark.parametrize("logname", [
    "nonmpi_dxt_anonymized.darshan",
    "partial_data_stdio.darshan",
    "treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
])
@pytest.mark.parametrize("num_cats", [2, 8])
def test_subplot_restriction(logname, num_cats):
    """The subplot count should respect ``num_cats`` without a collapsed layout.

    For review comment here:
    https://github.com/darshan-hpc/darshan/pull/397#discussion_r779176814

    The number of subplots in a figure should be consistent (<=) with the
    number of category rows requested (two axes per category), and the
    axes should be spread out vertically.
    """
    with darshan.DarshanReport(get_log_path(logname)) as report:
        figure = data_access_by_filesystem.plot_with_report(
            report=report,
            num_cats=num_cats,
        )
    subplots = figure.get_axes()
    assert len(subplots) <= num_cats * 2

    # top edge of each axes; the seed values reproduce the
    # starting points of a running max (0) and min (inf)
    top_edges = [ax.get_position().y1 for ax in subplots]
    highest_top = max([0, *top_edges])
    lowest_top = min([np.inf, *top_edges])
    # this spread helps ensure avoidance of
    # a collapsed layout of subplots
    assert (highest_top - lowest_top) > 0.2
[docs]
@pytest.mark.parametrize("logname", [
    "partial_data_dxt.darshan",
    "partial_data_stdio.darshan",
])
def test_plot_with_report_no_file(tmpdir, logname):
    """plot_with_report() should only return a figure, not write a file.

    The call is made with a temporary directory as the working directory,
    which must still be empty afterwards (i.e., no `.png` was generated).
    See review comment:
    https://github.com/darshan-hpc/darshan/pull/397#discussion_r689859765
    """
    with tmpdir.as_cwd():
        log_path = get_log_path(logname)
        with darshan.DarshanReport(log_path) as report:
            # the returned figure itself is not inspected by this test
            data_access_by_filesystem.plot_with_report(report=report,
                                                       num_cats=6)
        files_in_tmp = os.listdir(".")
        assert not files_in_tmp
[docs]
@pytest.mark.parametrize("logname, top_cat_name, third_cat_name", [
    # spot check the 1st and 3rd most active
    # categories for each case
    ("imbalanced-io.darshan", "/lus", "anonymized"),
    ("nonmpi_dxt_anonymized.darshan", "/", "anonymized"),
])
def test_plot_with_report_proper_sort(logname, top_cat_name, third_cat_name):
    """Spot check the category annotations on axes 0 and 4 of the figure.

    We want to sort categories in descending order of activity
    (bytes read + bytes written), which is especially important
    when using `num_cats` for `plot_with_report()`, otherwise
    we could end up with only i.e., inactive categories/filesystems
    displayed.
    See review comment:
    https://github.com/darshan-hpc/darshan/pull/397#discussion_r769186581
    """
    with darshan.DarshanReport(get_log_path(logname)) as report:
        figure = data_access_by_filesystem.plot_with_report(
            report=report,
            num_cats=6,
        )
    # axes index -> category name expected in its annotations
    expected_by_axes_index = {0: top_cat_name, 4: third_cat_name}
    for index, ax in enumerate(figure.get_axes()):
        if index not in expected_by_axes_index:
            continue
        for child in ax.get_children():
            if isinstance(child, matplotlib.text.Annotation):
                assert child.get_text() == expected_by_axes_index[index]
[docs]
@pytest.mark.parametrize("logname", [
    "imbalanced-io.darshan",
    "nonmpi_dxt_anonymized.darshan",
])
def test_plot_with_report_root_files(logname):
    """No category annotation should start with "//".

    Regression test for a bug that resulted in several categories
    that started with "//" for root-mounted files.
    """
    with darshan.DarshanReport(get_log_path(logname)) as report:
        figure = data_access_by_filesystem.plot_with_report(report=report)
    for ax in figure.get_axes():
        for child in ax.get_children():
            if not isinstance(child, matplotlib.text.Annotation):
                continue
            assert not child.get_text().startswith("//")
[docs]
@pytest.mark.parametrize("""logname,
expected_file_rd_series,
expected_file_wr_series,
expected_bytes_rd_series,
expected_bytes_wr_series""", [
    ("ior_hdf5_example.darshan",
     pd.Series({"<STDIN>": 0.0,
                "<STDOUT>": 0.0,
                "<STDERR>": 0.0,
                "/global": 1.0}),
     pd.Series({"<STDIN>": 0.0,
                "<STDOUT>": 1.0,
                "<STDERR>": 0.0,
                "/global": 1.0}),
     pd.Series({"<STDIN>": 0.0,
                "<STDOUT>": 0.0,
                "<STDERR>": 0.0,
                "/global": 4202504.0}),
     pd.Series({"<STDIN>": 0.0,
                "<STDOUT>": 2421.0,
                "<STDERR>": 0.0,
                "/global": 4195800.0}),
     ),
])
def test_stdio_basic_inclusion(logname,
                               expected_file_rd_series,
                               expected_file_wr_series,
                               expected_bytes_rd_series,
                               expected_bytes_wr_series):
    """Test for the inclusion of STDIO module data in the accounting.

    Files/bytes read/written should account for STDIO + POSIX data (the
    original "data access by category" implementation was POSIX-only).
    The flow follows the basic setup in plot_with_report().
    """
    # attach the Series/index names on renamed copies, so that the
    # Series objects owned by the parametrization are never mutated
    index_name = "filesystem_root"
    expected_file_rd_series = expected_file_rd_series.rename("filepath").rename_axis(index_name)
    expected_file_wr_series = expected_file_wr_series.rename("filepath").rename_axis(index_name)
    expected_bytes_rd_series = expected_bytes_rd_series.rename("BYTES_READ").rename_axis(index_name)
    expected_bytes_wr_series = expected_bytes_wr_series.rename("BYTES_WRITTEN").rename_axis(index_name)

    log_path = get_log_path(logname)
    with darshan.DarshanReport(log_path) as report:
        file_id_dict = report.data["name_records"]
        filesystem_roots = data_access_by_filesystem.identify_filesystems(file_id_dict=file_id_dict)
        # now, we expect the files and bytes data structures
        # to properly account for STDIO + POSIX data
        file_rd_series, file_wr_series = data_access_by_filesystem.unique_fs_rw_counter(
            report=report,
            filesystem_roots=filesystem_roots,
            file_id_dict=file_id_dict,
            processing_func=data_access_by_filesystem.process_unique_files,
            mod='POSIX')
        bytes_rd_series, bytes_wr_series = data_access_by_filesystem.unique_fs_rw_counter(
            report=report,
            filesystem_roots=filesystem_roots,
            file_id_dict=file_id_dict,
            processing_func=data_access_by_filesystem.process_byte_counts,
            mod='POSIX')
    assert_series_equal(file_rd_series, expected_file_rd_series)
    assert_series_equal(file_wr_series, expected_file_wr_series)
    assert_series_equal(bytes_rd_series, expected_bytes_rd_series)
    assert_series_equal(bytes_wr_series, expected_bytes_wr_series)
[docs]
def test_plot_with_empty_data():
    """plot_with_report() should return None when no records are included.

    A report object that filters out all contained records is generated
    to ensure the data access by category plot properly returns None
    instead of failing.
    """
    logpath = get_log_path("ior_hdf5_example.darshan")
    # use a bogus regex with the "include" filter mode to ensure no records are included
    with darshan.DarshanReport(logpath, filter_patterns=["bogus-regex"], filter_mode="include") as report:
        fig = data_access_by_filesystem.plot_with_report(report=report)
    # identity check: None is a singleton, and `==` could be
    # intercepted by a custom __eq__ on the returned object
    assert fig is None
[docs]
def test_with_filtered_data():
    """Check rec_to_rw_counter_dfs_with_cols() row counts on filtered reports.

    The same log is opened twice with the "ior-posix" filter pattern,
    once in "include" mode and once in "exclude" mode, and the lengths
    of the returned read/write dataframes are checked for each.
    """
    logpath = get_log_path("sample-badost.darshan")

    # "include" mode: per the original test notes, all STDIO module
    # records are filtered out and POSIX records should still remain
    with darshan.DarshanReport(logpath, filter_patterns=["ior-posix"], filter_mode="include") as report:
        reads_df, writes_df = data_access_by_filesystem.rec_to_rw_counter_dfs_with_cols(
            report=report,
            file_id_dict=report.data["name_records"],
        )
    assert len(reads_df) == 0
    assert len(writes_df) == 2048

    # "exclude" mode: per the original test notes, all POSIX module
    # records are filtered out and STDIO records should still remain
    with darshan.DarshanReport(logpath, filter_patterns=["ior-posix"], filter_mode="exclude") as report:
        reads_df, writes_df = data_access_by_filesystem.rec_to_rw_counter_dfs_with_cols(
            report=report,
            file_id_dict=report.data["name_records"],
        )
    assert len(reads_df) == 1
    assert len(writes_df) == 2