Working with Tables

import matplotlib.pyplot as plt

def read_csv(filename):
    with open(filename, 'rt') as file:
        # Skip header
        next(file)

        # Read lines
        lines = []
        for line in file:
            lines.append(line.strip().split(','))

    return lines

lines = read_csv("world-population.csv")
lines[:2]

    [['900',
      'World',
      '2',
      'Medium',
      '1950',
      '1950.5',
      '1266259.556',
      '1270171.462',
      '2536431.018',
      '19.497'],
     ['900',
      'World',
      '2',
      'Medium',
      '1951',
      '1951.5',
      '1290237.638',
      '1293796.589',
      '2584034.227',
      '19.863']]

import csv

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)

        # Skip header
        next(reader)

        lines = []
        for line in reader:
            lines.append(line)

        return lines

import csv

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)

        # Skip header
        next(reader)

        return list(reader)

lines = read_csv("world-population.csv")
lines[:2]

    [['900',
      'World',
      '2',
      'Medium',
      '1950',
      '1950.5',
      '1266259.556',
      '1270171.462',
      '2536431.018',
      '19.497'],
     ['900',
      'World',
      '2',
      'Medium',
      '1951',
      '1951.5',
      '1290237.638',
      '1293796.589',
      '2584034.227',
      '19.863']]

What if we want to return the header instead of skip it?

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        header = next(reader)
        lines = list(reader)
        return header, lines

header, lines = read_csv("world-population.csv")
header

    ['LocID',
     'Location',
     'VarID',
     'Variant',
     'Time',
     'MidPeriod',
     'PopMale',
     'PopFemale',
     'PopTotal',
     'PopDensity']

lines[:2]

    [['900',
      'World',
      '2',
      'Medium',
      '1950',
      '1950.5',
      '1266259.556',
      '1270171.462',
      '2536431.018',
      '19.497'],
     ['900',
      'World',
      '2',
      'Medium',
      '1951',
      '1951.5',
      '1290237.638',
      '1293796.589',
      '2584034.227',
      '19.863']]

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        return next(reader), list(reader)

header, lines = read_csv("world-population.csv")
header

    ['LocID',
     'Location',
     'VarID',
     'Variant',
     'Time',
     'MidPeriod',
     'PopMale',
     'PopFemale',
     'PopTotal',
     'PopDensity']

lines[:2]

    [['900',
      'World',
      '2',
      'Medium',
      '1950',
      '1950.5',
      '1266259.556',
      '1270171.462',
      '2536431.018',
      '19.497'],
     ['900',
      'World',
      '2',
      'Medium',
      '1951',
      '1951.5',
      '1290237.638',
      '1293796.589',
      '2584034.227',
      '19.863']]

for line in lines[:3]:
    for name, value in zip(header, line):
        print(f"{name}: {value}")
    print()

    LocID: 900
    Location: World
    VarID: 2
    Variant: Medium
    Time: 1950
    MidPeriod: 1950.5
    PopMale: 1266259.556
    PopFemale: 1270171.462
    PopTotal: 2536431.018
    PopDensity: 19.497

    LocID: 900
    Location: World
    VarID: 2
    Variant: Medium
    Time: 1951
    MidPeriod: 1951.5
    PopMale: 1290237.638
    PopFemale: 1293796.589
    PopTotal: 2584034.227
    PopDensity: 19.863

    LocID: 900
    Location: World
    VarID: 2
    Variant: Medium
    Time: 1952
    MidPeriod: 1952.5
    PopMale: 1313854.565
    PopFemale: 1317007.125
    PopTotal: 2630861.69
    PopDensity: 20.223

for letter, number, word in zip(['a', 'b', 'c'], ['1', '2', '3','4'], ['this', 'is', 'cool']):
    print(letter, number, word)

print(list(zip('abc', '123')))

print(list(range(10)))

    a 1 this
    b 2 is
    c 3 cool
    [('a', '1'), ('b', '2'), ('c', '3')]
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

for letter, number in zip('abc', '123'):
    print(letter, number)

Joining the header and the values together seems useful.

Can we write a function that does that for us?

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        header = next(reader)
        records = []
        for line in reader:
            record = {}
            for name, value in zip(header, line):
                record[name] = value
            records.append(record)

    return records

records = read_csv("world-population.csv")
records[:2]

    [{'LocID': '900',
      'Location': 'World',
      'VarID': '2',
      'Variant': 'Medium',
      'Time': '1950',
      'MidPeriod': '1950.5',
      'PopMale': '1266259.556',
      'PopFemale': '1270171.462',
      'PopTotal': '2536431.018',
      'PopDensity': '19.497'},
     {'LocID': '900',
      'Location': 'World',
      'VarID': '2',
      'Variant': 'Medium',
      'Time': '1951',
      'MidPeriod': '1951.5',
      'PopMale': '1290237.638',
      'PopFemale': '1293796.589',
      'PopTotal': '2584034.227',
      'PopDensity': '19.863'}]

records[8]

    {'LocID': '900',
     'Location': 'World',
     'VarID': '2',
     'Variant': 'Medium',
     'Time': '1958',
     'MidPeriod': '1958.5',
     'PopMale': '1462864.322',
     'PopFemale': '1462822.358',
     'PopTotal': '2925686.68',
     'PopDensity': '22.489'}

Looking up all the data for single entry is easy.

How would I pull out all the values for a specific column?

def select_values(records, name):
    values = []
    for record in records:
        values.append(record[name])
    return values

total_pop = select_values(records, "PopTotal")
total_pop[:5]

    ['2536431.018', '2584034.227', '2630861.69', '2677609.061', '2724846.754']

years = select_values(records, 'Time')
years[:5]

    ['1950', '1951', '1952', '1953', '1954']

plt.plot(years, total_pop)
plt.title("Total Population")

    Text(0.5, 1.0, 'Total Population')

png

🤨

plt.plot(['a', 'b', 'c', 'd'], ['x', 'y', 'z', 'x'])

    [<matplotlib.lines.Line2D at 0x11e7845b0>]

png

While that feature is cool, that’s not quite what we wanted in our scenario.

We want to convert the string values into integers and floating-point numbers.

def select_values_as_int(records, name):
    values = []
    for record in records:
        values.append(int(record[name]))
    return values

def select_values_as_float(records, name):
    values = []
    for record in records:
        values.append(float(record[name]))
    return values

def select_values(records, name, transform):
    values = []
    for record in records:
        values.append(transform(record[name]))
    return values

total_pop = select_values(records, 'PopTotal', float)
years = select_values(records, 'Time', int)

plt.plot(years, total_pop)
plt.title("Total Population")

    Text(0.5, 1.0, 'Total Population')

png

records = read_csv("world-population.csv")

total_pop = select_values(records, 'PopTotal', float)
years = select_values(records, 'Time', int)

plt.plot(years, total_pop)
plt.title("Total Population")

    Text(0.5, 1.0, 'Total Population')

png

We call this approach the record model.

We have a list of dictionaries, each dictionary represents a row in the table.

Now, there is another way to parse a CSV.

What if, instead of getting a list of dictionaries (i.e. rows), we got a dictionary of lists (i.e. columns)?

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        data = {}

        header = next(reader)
        for name in header:
            data[name] = []

        for line in reader:
            for name, value in zip(header, line):
                data[name].append(value)

    return data

data = read_csv("world-population.csv")

for name, values in data.items():
    print(f"{name}: {values[:5]}")

    LocID: ['900', '900', '900', '900', '900']
    Location: ['World', 'World', 'World', 'World', 'World']
    VarID: ['2', '2', '2', '2', '2']
    Variant: ['Medium', 'Medium', 'Medium', 'Medium', 'Medium']
    Time: ['1950', '1951', '1952', '1953', '1954']
    MidPeriod: ['1950.5', '1951.5', '1952.5', '1953.5', '1954.5']
    PopMale: ['1266259.556', '1290237.638', '1313854.565', '1337452.786', '1361313.834']
    PopFemale: ['1270171.462', '1293796.589', '1317007.125', '1340156.275', '1363532.92']
    PopTotal: ['2536431.018', '2584034.227', '2630861.69', '2677609.061', '2724846.754']
    PopDensity: ['19.497', '19.863', '20.223', '20.582', '20.945']

def read_csv(filename, transforms):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        data = {}

        header = next(reader)
        for name in header:
            data[name] = []

        for line in reader:
            for name, value in zip(header, line):
                transform = str
                if name in transforms:
                    transform = transforms[name]

                data[name].append(transform(value))

    return data

transforms = {
    'PopTotal': float,
    'Time': int,
    'PopMale': float,
    'PopFemale': float,
    'PopDensity': float
}
data = read_csv("world-population.csv", transforms)

for name, values in data.items():
    print(f"{name}: {values[:5]}")

    LocID: ['900', '900', '900', '900', '900']
    Location: ['World', 'World', 'World', 'World', 'World']
    VarID: ['2', '2', '2', '2', '2']
    Variant: ['Medium', 'Medium', 'Medium', 'Medium', 'Medium']
    Time: [1950, 1951, 1952, 1953, 1954]
    MidPeriod: ['1950.5', '1951.5', '1952.5', '1953.5', '1954.5']
    PopMale: [1266259.556, 1290237.638, 1313854.565, 1337452.786, 1361313.834]
    PopFemale: [1270171.462, 1293796.589, 1317007.125, 1340156.275, 1363532.92]
    PopTotal: [2536431.018, 2584034.227, 2630861.69, 2677609.061, 2724846.754]
    PopDensity: [19.497, 19.863, 20.223, 20.582, 20.945]

plt.plot(data['Time'], data['PopTotal'])
plt.title("Total Population")

    Text(0.5, 1.0, 'Total Population')

png

transforms = {
    'PopTotal': float,
    'Time': int,
    'PopMale': float,
    'PopFemale': float,
    'PopDensity': float
}
data = read_csv("world-population.csv", transforms)

plt.plot(data['Time'], data['PopTotal'])
plt.title("Total Population")

    Text(0.5, 1.0, 'Total Population')

png

Now, getting whole columns is really easy.

How can I get individual records?

def get_record(data, index):
    record = {}
    for name, values in data.items():
        record[name] = values[index]
    return record

get_record(data, 8)

    {'LocID': '900',
     'Location': 'World',
     'VarID': '2',
     'Variant': 'Medium',
     'Time': 1958,
     'MidPeriod': '1958.5',
     'PopMale': 1462864.322,
     'PopFemale': 1462822.358,
     'PopTotal': 2925686.68,
     'PopDensity': 22.489}

We call this the dataframe model.

We have a dictionary of lists, each list representing a column in the table.

Key Ideas

When reading CSVs (or other tabular data), we will typically use either the record model or the dataframe model.
- The record model is to store the data as a list of dictionaries, each dictionary representing a row or record.
- The dataframe model is to store the data as a dictionary of lists, each list representing a column or series.
zip is pretty handy
We can pass functions as arguments to other functions!

What did you learn today?

What styles were more intuitive to you?

What approaches seem more useful to you? Why?