A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/io/pytables/test_select.py

976 lines
33 KiB

from warnings import catch_warnings
import numpy as np
import pytest
from pandas._libs.tslibs import Timestamp
import pandas as pd
from pandas import (
DataFrame,
HDFStore,
Index,
MultiIndex,
Series,
_testing as tm,
bdate_range,
concat,
date_range,
isna,
read_hdf,
)
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_path,
ensure_clean_store,
)
from pandas.io.pytables import Term
pytestmark = pytest.mark.single_cpu
def test_select_columns_in_where(setup_path):
# GH 6169
# recreate multi-indexes when columns is passed
# in the `where` argument
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["foo_name", "bar_name"],
)
# With a DataFrame
df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table")
expected = df[["A"]]
tm.assert_frame_equal(store.select("df", columns=["A"]), expected)
tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected)
# With a Series
s = Series(np.random.randn(10), index=index, name="A")
with ensure_clean_store(setup_path) as store:
store.put("s", s, format="table")
tm.assert_series_equal(store.select("s", where="columns=['A']"), s)
def test_select_with_dups(setup_path):
# single dtypes
df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"])
df.index = date_range("20130101 9:30", periods=10, freq="T")
with ensure_clean_store(setup_path) as store:
store.append("df", df)
result = store.select("df")
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=df.columns)
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=["A"])
expected = df.loc[:, ["A"]]
tm.assert_frame_equal(result, expected)
# dups across dtypes
df = concat(
[
DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
DataFrame(
np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
),
],
axis=1,
)
df.index = date_range("20130101 9:30", periods=10, freq="T")
with ensure_clean_store(setup_path) as store:
store.append("df", df)
result = store.select("df")
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
result = store.select("df", columns=df.columns)
expected = df
tm.assert_frame_equal(result, expected, by_blocks=True)
expected = df.loc[:, ["A"]]
result = store.select("df", columns=["A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
expected = df.loc[:, ["B", "A"]]
result = store.select("df", columns=["B", "A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
# duplicates on both index and columns
with ensure_clean_store(setup_path) as store:
store.append("df", df)
store.append("df", df)
expected = df.loc[:, ["B", "A"]]
expected = concat([expected, expected])
result = store.select("df", columns=["B", "A"])
tm.assert_frame_equal(result, expected, by_blocks=True)
def test_select(setup_path):
with ensure_clean_store(setup_path) as store:
with catch_warnings(record=True):
# select with columns=
df = tm.makeTimeDataFrame()
_maybe_remove(store, "df")
store.append("df", df)
result = store.select("df", columns=["A", "B"])
expected = df.reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# equivalently
result = store.select("df", [("columns=['A', 'B']")])
expected = df.reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# with a data column
_maybe_remove(store, "df")
store.append("df", df, data_columns=["A"])
result = store.select("df", ["A > 0"], columns=["A", "B"])
expected = df[df.A > 0].reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# all a data columns
_maybe_remove(store, "df")
store.append("df", df, data_columns=True)
result = store.select("df", ["A > 0"], columns=["A", "B"])
expected = df[df.A > 0].reindex(columns=["A", "B"])
tm.assert_frame_equal(expected, result)
# with a data column, but different columns
_maybe_remove(store, "df")
store.append("df", df, data_columns=["A"])
result = store.select("df", ["A > 0"], columns=["C", "D"])
expected = df[df.A > 0].reindex(columns=["C", "D"])
tm.assert_frame_equal(expected, result)
def test_select_dtypes(setup_path):
with ensure_clean_store(setup_path) as store:
# with a Timestamp data column (GH #2637)
df = DataFrame(
{
"ts": bdate_range("2012-01-01", periods=300),
"A": np.random.randn(300),
}
)
_maybe_remove(store, "df")
store.append("df", df, data_columns=["ts", "A"])
result = store.select("df", "ts>=Timestamp('2012-02-01')")
expected = df[df.ts >= Timestamp("2012-02-01")]
tm.assert_frame_equal(expected, result)
# bool columns (GH #2849)
df = DataFrame(np.random.randn(5, 2), columns=["A", "B"])
df["object"] = "foo"
df.loc[4:5, "object"] = "bar"
df["boolv"] = df["A"] > 0
_maybe_remove(store, "df")
store.append("df", df, data_columns=True)
expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa:E712
for v in [True, "true", 1]:
result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
tm.assert_frame_equal(expected, result)
expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa:E712
for v in [False, "false", 0]:
result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
tm.assert_frame_equal(expected, result)
# integer index
df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)})
_maybe_remove(store, "df_int")
store.append("df_int", df)
result = store.select("df_int", "index<10 and columns=['A']")
expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
tm.assert_frame_equal(expected, result)
# float index
df = DataFrame(
{
"A": np.random.rand(20),
"B": np.random.rand(20),
"index": np.arange(20, dtype="f8"),
}
)
_maybe_remove(store, "df_float")
store.append("df_float", df)
result = store.select("df_float", "index<10.0 and columns=['A']")
expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
tm.assert_frame_equal(expected, result)
with ensure_clean_store(setup_path) as store:
# floats w/o NaN
df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
store.append("df1", df, data_columns=True)
result = store.select("df1", where="values>2.0")
expected = df[df["values"] > 2.0]
tm.assert_frame_equal(expected, result)
# floats with NaN
df.iloc[0] = np.nan
expected = df[df["values"] > 2.0]
store.append("df2", df, data_columns=True, index=False)
result = store.select("df2", where="values>2.0")
tm.assert_frame_equal(expected, result)
# https://github.com/PyTables/PyTables/issues/282
# bug in selection when 0th row has a np.nan and an index
# store.append('df3',df,data_columns=True)
# result = store.select(
# 'df3', where='values>2.0')
# tm.assert_frame_equal(expected, result)
# not in first position float with NaN ok too
df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
df.iloc[1] = np.nan
expected = df[df["values"] > 2.0]
store.append("df4", df, data_columns=True)
result = store.select("df4", where="values>2.0")
tm.assert_frame_equal(expected, result)
# test selection with comparison against numpy scalar
# GH 11283
with ensure_clean_store(setup_path) as store:
df = tm.makeDataFrame()
expected = df[df["A"] > 0]
store.append("df", df, data_columns=True)
np_zero = np.float64(0) # noqa:F841
result = store.select("df", where=["A>np_zero"])
tm.assert_frame_equal(expected, result)
def test_select_with_many_inputs(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
{
"ts": bdate_range("2012-01-01", periods=300),
"A": np.random.randn(300),
"B": range(300),
"users": ["a"] * 50
+ ["b"] * 50
+ ["c"] * 100
+ [f"a{i:03d}" for i in range(100)],
}
)
_maybe_remove(store, "df")
store.append("df", df, data_columns=["ts", "A", "B", "users"])
# regular select
result = store.select("df", "ts>=Timestamp('2012-02-01')")
expected = df[df.ts >= Timestamp("2012-02-01")]
tm.assert_frame_equal(expected, result)
# small selector
result = store.select("df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']")
expected = df[
(df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"])
]
tm.assert_frame_equal(expected, result)
# big selector along the columns
selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)]
result = store.select("df", "ts>=Timestamp('2012-02-01') and users=selector")
expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)]
tm.assert_frame_equal(expected, result)
selector = range(100, 200)
result = store.select("df", "B=selector")
expected = df[df.B.isin(selector)]
tm.assert_frame_equal(expected, result)
assert len(result) == 100
# big selector along the index
selector = Index(df.ts[0:100].values)
result = store.select("df", "ts=selector")
expected = df[df.ts.isin(selector.values)]
tm.assert_frame_equal(expected, result)
assert len(result) == 100
def test_select_iterator(setup_path):
# single table
with ensure_clean_store(setup_path) as store:
df = tm.makeTimeDataFrame(500)
_maybe_remove(store, "df")
store.append("df", df)
expected = store.select("df")
results = list(store.select("df", iterator=True))
result = concat(results)
tm.assert_frame_equal(expected, result)
results = list(store.select("df", chunksize=100))
assert len(results) == 5
result = concat(results)
tm.assert_frame_equal(expected, result)
results = list(store.select("df", chunksize=150))
result = concat(results)
tm.assert_frame_equal(result, expected)
with ensure_clean_path(setup_path) as path:
df = tm.makeTimeDataFrame(500)
df.to_hdf(path, "df_non_table")
msg = "can only use an iterator or chunksize on a table"
with pytest.raises(TypeError, match=msg):
read_hdf(path, "df_non_table", chunksize=100)
with pytest.raises(TypeError, match=msg):
read_hdf(path, "df_non_table", iterator=True)
with ensure_clean_path(setup_path) as path:
df = tm.makeTimeDataFrame(500)
df.to_hdf(path, "df", format="table")
results = list(read_hdf(path, "df", chunksize=100))
result = concat(results)
assert len(results) == 5
tm.assert_frame_equal(result, df)
tm.assert_frame_equal(result, read_hdf(path, "df"))
# multiple
with ensure_clean_store(setup_path) as store:
df1 = tm.makeTimeDataFrame(500)
store.append("df1", df1, data_columns=True)
df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format)
df2["foo"] = "bar"
store.append("df2", df2)
df = concat([df1, df2], axis=1)
# full selection
expected = store.select_as_multiple(["df1", "df2"], selector="df1")
results = list(
store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150)
)
result = concat(results)
tm.assert_frame_equal(expected, result)
def test_select_iterator_complete_8014(setup_path):
# GH 8014
# using iterator and where clause
chunksize = 1e4
# no iterator
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100064, "S")
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[-1]
# select w/o iteration and no where clause works
result = store.select("df")
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, single term, begin
# of range, works
where = f"index >= '{beg_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, single term, end
# of range, works
where = f"index <= '{end_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# select w/o iterator and where clause, inclusive range,
# works
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
result = store.select("df", where=where)
tm.assert_frame_equal(expected, result)
# with iterator, full range
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100064, "S")
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[-1]
# select w/iterator and no where clause works
results = list(store.select("df", chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
tm.assert_frame_equal(expected, result)
def test_select_iterator_non_complete_8014(setup_path):
# GH 8014
# using iterator and where clause
chunksize = 1e4
# with iterator, non complete range
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100064, "S")
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[1]
end_dt = expected.index[-2]
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index >= beg_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index <= end_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
tm.assert_frame_equal(rexpected, result)
# with iterator, empty where
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100064, "S")
_maybe_remove(store, "df")
store.append("df", expected)
end_dt = expected.index[-1]
# select w/iterator and where clause, single term, begin of range
where = f"index > '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
assert 0 == len(results)
def test_select_iterator_many_empty_frames(setup_path):
# GH 8014
# using iterator and where clause can return many empty
# frames.
chunksize = 10_000
# with iterator, range limited to the first chunk
with ensure_clean_store(setup_path) as store:
expected = tm.makeTimeDataFrame(100000, "S")
_maybe_remove(store, "df")
store.append("df", expected)
beg_dt = expected.index[0]
end_dt = expected.index[chunksize - 1]
# select w/iterator and where clause, single term, begin of range
where = f"index >= '{beg_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
result = concat(results)
rexpected = expected[expected.index >= beg_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, single term, end of range
where = f"index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
assert len(results) == 1
result = concat(results)
rexpected = expected[expected.index <= end_dt]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause, inclusive range
where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
# should be 1, is 10
assert len(results) == 1
result = concat(results)
rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
tm.assert_frame_equal(rexpected, result)
# select w/iterator and where clause which selects
# *nothing*.
#
# To be consistent with Python idiom I suggest this should
# return [] e.g. `for e in []: print True` never prints
# True.
where = f"index <= '{beg_dt}' & index >= '{end_dt}'"
results = list(store.select("df", where=where, chunksize=chunksize))
# should be []
assert len(results) == 0
def test_frame_select(setup_path):
df = tm.makeTimeDataFrame()
with ensure_clean_store(setup_path) as store:
store.put("frame", df, format="table")
date = df.index[len(df) // 2]
crit1 = Term("index>=date")
assert crit1.env.scope["date"] == date
crit2 = "columns=['A', 'D']"
crit3 = "columns=A"
result = store.select("frame", [crit1, crit2])
expected = df.loc[date:, ["A", "D"]]
tm.assert_frame_equal(result, expected)
result = store.select("frame", [crit3])
expected = df.loc[:, ["A"]]
tm.assert_frame_equal(result, expected)
# invalid terms
df = tm.makeTimeDataFrame()
store.append("df_time", df)
msg = "could not convert string to Timestamp"
with pytest.raises(ValueError, match=msg):
store.select("df_time", "index>0")
# can't select if not written as table
# store['frame'] = df
# with pytest.raises(ValueError):
# store.select('frame', [crit1, crit2])
def test_frame_select_complex(setup_path):
# select via complex criteria
df = tm.makeTimeDataFrame()
df["string"] = "foo"
df.loc[df.index[0:4], "string"] = "bar"
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table", data_columns=["string"])
# empty
result = store.select("df", 'index>df.index[3] & string="bar"')
expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")]
tm.assert_frame_equal(result, expected)
result = store.select("df", 'index>df.index[3] & string="foo"')
expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")]
tm.assert_frame_equal(result, expected)
# or
result = store.select("df", 'index>df.index[3] | string="bar"')
expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")]
tm.assert_frame_equal(result, expected)
result = store.select(
"df", '(index>df.index[3] & index<=df.index[6]) | string="bar"'
)
expected = df.loc[
((df.index > df.index[3]) & (df.index <= df.index[6]))
| (df.string == "bar")
]
tm.assert_frame_equal(result, expected)
# invert
result = store.select("df", 'string!="bar"')
expected = df.loc[df.string != "bar"]
tm.assert_frame_equal(result, expected)
# invert not implemented in numexpr :(
msg = "cannot use an invert condition when passing to numexpr"
with pytest.raises(NotImplementedError, match=msg):
store.select("df", '~(string="bar")')
# invert ok for filters
result = store.select("df", "~(columns=['A','B'])")
expected = df.loc[:, df.columns.difference(["A", "B"])]
tm.assert_frame_equal(result, expected)
# in
result = store.select("df", "index>df.index[3] & columns in ['A','B']")
expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"])
tm.assert_frame_equal(result, expected)
def test_frame_select_complex2(setup_path):
with ensure_clean_path(["params.hdf", "hist.hdf"]) as paths:
pp, hh = paths
# use non-trivial selection criteria
params = DataFrame({"A": [1, 1, 2, 2, 3]})
params.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"])
selection = read_hdf(pp, "df", where="A=[2,3]")
hist = DataFrame(
np.random.randn(25, 1),
columns=["data"],
index=MultiIndex.from_tuples(
[(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"]
),
)
hist.to_hdf(hh, "df", mode="w", format="table")
expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")
# scope with list like
l0 = selection.index.tolist() # noqa:F841
store = HDFStore(hh)
result = store.select("df", where="l1=l0")
tm.assert_frame_equal(result, expected)
store.close()
result = read_hdf(hh, "df", where="l1=l0")
tm.assert_frame_equal(result, expected)
# index
index = selection.index # noqa:F841
result = read_hdf(hh, "df", where="l1=index")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=selection.index")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
tm.assert_frame_equal(result, expected)
result = read_hdf(hh, "df", where="l1=list(selection.index)")
tm.assert_frame_equal(result, expected)
# scope with index
store = HDFStore(hh)
result = store.select("df", where="l1=index")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=selection.index")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=selection.index.tolist()")
tm.assert_frame_equal(result, expected)
result = store.select("df", where="l1=list(selection.index)")
tm.assert_frame_equal(result, expected)
store.close()
def test_invalid_filtering(setup_path):
# can't use more than one filter (atm)
df = tm.makeTimeDataFrame()
with ensure_clean_store(setup_path) as store:
store.put("df", df, format="table")
msg = "unable to collapse Joint Filters"
# not implemented
with pytest.raises(NotImplementedError, match=msg):
store.select("df", "columns=['A'] | columns=['B']")
# in theory we could deal with this
with pytest.raises(NotImplementedError, match=msg):
store.select("df", "columns=['A','B'] & columns=['C']")
def test_string_select(setup_path):
# GH 2973
with ensure_clean_store(setup_path) as store:
df = tm.makeTimeDataFrame()
# test string ==/!=
df["x"] = "none"
df.loc[df.index[2:7], "x"] = ""
store.append("df", df, data_columns=["x"])
result = store.select("df", "x=none")
expected = df[df.x == "none"]
tm.assert_frame_equal(result, expected)
result = store.select("df", "x!=none")
expected = df[df.x != "none"]
tm.assert_frame_equal(result, expected)
df2 = df.copy()
df2.loc[df2.x == "", "x"] = np.nan
store.append("df2", df2, data_columns=["x"])
result = store.select("df2", "x!=none")
expected = df2[isna(df2.x)]
tm.assert_frame_equal(result, expected)
# int ==/!=
df["int"] = 1
df.loc[df.index[2:7], "int"] = 2
store.append("df3", df, data_columns=["int"])
result = store.select("df3", "int=2")
expected = df[df.int == 2]
tm.assert_frame_equal(result, expected)
result = store.select("df3", "int!=2")
expected = df[df.int != 2]
tm.assert_frame_equal(result, expected)
def test_select_as_multiple(setup_path):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
df2["foo"] = "bar"
with ensure_clean_store(setup_path) as store:
msg = "keys must be a list/tuple"
# no tables stored
with pytest.raises(TypeError, match=msg):
store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
store.append("df1", df1, data_columns=["A", "B"])
store.append("df2", df2)
# exceptions
with pytest.raises(TypeError, match=msg):
store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
with pytest.raises(TypeError, match=msg):
store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1")
msg = "'No object named df3 in the file'"
with pytest.raises(KeyError, match=msg):
store.select_as_multiple(
["df1", "df3"], where=["A>0", "B>0"], selector="df1"
)
with pytest.raises(KeyError, match=msg):
store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1")
with pytest.raises(KeyError, match="'No object named df4 in the file'"):
store.select_as_multiple(
["df1", "df2"], where=["A>0", "B>0"], selector="df4"
)
# default select
result = store.select("df1", ["A>0", "B>0"])
expected = store.select_as_multiple(
["df1"], where=["A>0", "B>0"], selector="df1"
)
tm.assert_frame_equal(result, expected)
expected = store.select_as_multiple("df1", where=["A>0", "B>0"], selector="df1")
tm.assert_frame_equal(result, expected)
# multiple
result = store.select_as_multiple(
["df1", "df2"], where=["A>0", "B>0"], selector="df1"
)
expected = concat([df1, df2], axis=1)
expected = expected[(expected.A > 0) & (expected.B > 0)]
tm.assert_frame_equal(result, expected, check_freq=False)
# FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds
# multiple (diff selector)
result = store.select_as_multiple(
["df1", "df2"], where="index>df2.index[4]", selector="df2"
)
expected = concat([df1, df2], axis=1)
expected = expected[5:]
tm.assert_frame_equal(result, expected)
# test exception for diff rows
store.append("df3", tm.makeTimeDataFrame(nper=50))
msg = "all tables must have exactly the same nrows!"
with pytest.raises(ValueError, match=msg):
store.select_as_multiple(
["df1", "df3"], where=["A>0", "B>0"], selector="df1"
)
def test_nan_selection_bug_4858(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
df.iloc[0] = np.nan
expected = DataFrame(
{"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]},
index=[3, 4, 5],
)
# write w/o the index on that particular column
store.append("df", df, data_columns=True, index=["cols"])
result = store.select("df", where="values>2.0")
tm.assert_frame_equal(result, expected)
def test_query_with_nested_special_character(setup_path):
df = DataFrame(
{
"a": ["a", "a", "c", "b", "test & test", "c", "b", "e"],
"b": [1, 2, 3, 4, 5, 6, 7, 8],
}
)
expected = df[df.a == "test & test"]
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
result = store.select("test", 'a = "test & test"')
tm.assert_frame_equal(expected, result)
def test_query_long_float_literal(setup_path):
# GH 14241
df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]})
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
cutoff = 1000000000.0006
result = store.select("test", f"A < {cutoff:.4f}")
assert result.empty
cutoff = 1000000000.0010
result = store.select("test", f"A > {cutoff:.4f}")
expected = df.loc[[1, 2], :]
tm.assert_frame_equal(expected, result)
exact = 1000000000.0011
result = store.select("test", f"A == {exact:.4f}")
expected = df.loc[[1], :]
tm.assert_frame_equal(expected, result)
def test_query_compare_column_type(setup_path):
# GH 15492
df = DataFrame(
{
"date": ["2014-01-01", "2014-01-02"],
"real_date": date_range("2014-01-01", periods=2),
"float": [1.1, 1.2],
"int": [1, 2],
},
columns=["date", "real_date", "float", "int"],
)
with ensure_clean_store(setup_path) as store:
store.append("test", df, format="table", data_columns=True)
ts = Timestamp("2014-01-01") # noqa:F841
result = store.select("test", where="real_date > ts")
expected = df.loc[[1], :]
tm.assert_frame_equal(expected, result)
for op in ["<", ">", "=="]:
# non strings to string column always fail
for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]:
query = f"date {op} v"
msg = f"Cannot compare {v} of type {type(v)} to string column"
with pytest.raises(TypeError, match=msg):
store.select("test", where=query)
# strings to other columns must be convertible to type
v = "a"
for col in ["int", "float", "real_date"]:
query = f"{col} {op} v"
msg = "could not convert string to "
with pytest.raises(ValueError, match=msg):
store.select("test", where=query)
for v, col in zip(
["1", "1.1", "2014-01-01"], ["int", "float", "real_date"]
):
query = f"{col} {op} v"
result = store.select("test", where=query)
if op == "==":
expected = df.loc[[0], :]
elif op == ">":
expected = df.loc[[1], :]
else:
expected = df.loc[[], :]
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("where", ["", (), (None,), [], [None]])
def test_select_empty_where(where):
# GH26610
df = DataFrame([1, 2, 3])
with ensure_clean_path("empty_where.h5") as path:
with HDFStore(path) as store:
store.put("df", df, "t")
result = read_hdf(store, "df", where=where)
tm.assert_frame_equal(result, df)