A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/io/json/test_readlines.py

298 lines
9.5 KiB

from io import StringIO
from pathlib import Path
import pytest
import pandas as pd
from pandas import (
DataFrame,
read_json,
)
import pandas._testing as tm
from pandas.io.json._json import JsonReader
@pytest.fixture
def lines_json_df():
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
return df.to_json(lines=True, orient="records")
def test_read_jsonl():
# GH9180
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_read_datetime():
# GH33787
df = DataFrame(
[([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
columns=["accounts", "date", "name"],
)
json_line = df.to_json(lines=True, orient="records")
result = read_json(json_line)
expected = DataFrame(
[[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]],
columns=["accounts", "date", "name"],
)
tm.assert_frame_equal(result, expected)
def test_read_jsonl_unicode_chars():
# GH15132: non-ascii unicode characters
# \u201d == RIGHT DOUBLE QUOTATION MARK
# simulate file handle
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
json = StringIO(json)
result = read_json(json, lines=True)
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# simulate string
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
result = read_json(json, lines=True)
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_to_jsonl():
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
assert result == expected
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
assert result == expected
tm.assert_frame_equal(read_json(result, lines=True), df)
# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
assert result == expected
tm.assert_frame_equal(read_json(result, lines=True), df)
def test_to_jsonl_count_new_lines():
# GH36888
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n")
expected_new_lines_count = 2
assert actual_new_lines_count == expected_new_lines_count
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
# Basic test that read_json(chunks=True) gives the same result as
# read_json(chunks=False)
# GH17048: memory usage when lines=True
unchunked = read_json(StringIO(lines_json_df), lines=True)
with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as reader:
chunked = pd.concat(reader)
tm.assert_frame_equal(chunked, unchunked)
def test_readjson_chunksize_requires_lines(lines_json_df):
msg = "chunksize can only be passed if lines=True"
with pytest.raises(ValueError, match=msg):
with read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _:
pass
def test_readjson_chunks_series():
# Test reading line-format JSON to Series with chunksize param
s = pd.Series({"A": 1, "B": 2})
strio = StringIO(s.to_json(lines=True, orient="records"))
unchunked = read_json(strio, lines=True, typ="Series")
strio = StringIO(s.to_json(lines=True, orient="records"))
with read_json(strio, lines=True, typ="Series", chunksize=1) as reader:
chunked = pd.concat(reader)
tm.assert_series_equal(chunked, unchunked)
def test_readjson_each_chunk(lines_json_df):
# Other tests check that the final result of read_json(chunksize=True)
# is correct. This checks the intermediate chunks.
with read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader:
chunks = list(reader)
assert chunks[0].shape == (2, 2)
assert chunks[1].shape == (1, 2)
def test_readjson_chunks_from_file():
with tm.ensure_clean("test.json") as path:
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.to_json(path, lines=True, orient="records")
with read_json(path, lines=True, chunksize=1) as reader:
chunked = pd.concat(reader)
unchunked = read_json(path, lines=True)
tm.assert_frame_equal(unchunked, chunked)
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
with tm.ensure_clean("test.json") as path:
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.to_json(path, lines=True, orient="records")
reader = JsonReader(
path,
orient=None,
typ="frame",
dtype=True,
convert_axes=True,
convert_dates=True,
keep_default_dates=True,
numpy=False,
precise_float=False,
date_unit=None,
encoding=None,
lines=True,
chunksize=chunksize,
compression=None,
nrows=None,
)
with reader:
reader.read()
assert (
reader.handles.handle.closed
), f"didn't close stream with chunksize = {chunksize}"
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
msg = r"'chunksize' must be an integer >=1"
with pytest.raises(ValueError, match=msg):
with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as _:
pass
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
j = """
{"A":1,"B":4}
{"A":2,"B":5}
{"A":3,"B":6}
"""
orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
test = read_json(j, lines=True, chunksize=chunksize)
if chunksize is not None:
with test:
test = pd.concat(test)
tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")
def test_readjson_unicode(monkeypatch):
with tm.ensure_clean("test.json") as path:
monkeypatch.setattr("locale.getpreferredencoding", lambda l: "cp949")
with open(path, "w", encoding="utf-8") as f:
f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')
result = read_json(path)
expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows(nrows):
# GH 33916
# Test reading line-format JSON to Series with nrows param
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
result = read_json(jsonl, lines=True, nrows=nrows)
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
def test_readjson_nrows_chunks(nrows, chunksize):
# GH 33916
# Test reading line-format JSON to Series with nrows and chunksize param
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
with read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) as reader:
chunked = pd.concat(reader)
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
tm.assert_frame_equal(chunked, expected)
def test_readjson_nrows_requires_lines():
# GH 33916
# Test ValuError raised if nrows is set without setting lines in read_json
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
msg = "nrows can only be passed if lines=True"
with pytest.raises(ValueError, match=msg):
read_json(jsonl, lines=False, nrows=2)
def test_readjson_lines_chunks_fileurl(datapath):
# GH 27135
# Test reading line-format JSON from file url
df_list_expected = [
DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
]
os_path = datapath("io", "json", "data", "line_delimited.json")
file_url = Path(os_path).as_uri()
with read_json(file_url, lines=True, chunksize=1) as url_reader:
for index, chuck in enumerate(url_reader):
tm.assert_frame_equal(chuck, df_list_expected[index])
def test_chunksize_is_incremental():
# See https://github.com/pandas-dev/pandas/issues/34548
jsonl = (
"""{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}\n"""
* 1000
)
class MyReader:
def __init__(self, contents):
self.read_count = 0
self.stringio = StringIO(contents)
def read(self, *args):
self.read_count += 1
return self.stringio.read(*args)
def __iter__(self):
self.read_count += 1
return iter(self.stringio)
reader = MyReader(jsonl)
assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
assert reader.read_count > 10