BYU logo Computer Science

Working with Tables

import matplotlib.pyplot as plt
def read_csv(filename):
    with open(filename, 'rt') as file:
        # Skip header
        next(file)

        # Read lines
        lines = []
        for line in file:
            lines.append(line.strip().split(','))

    return lines
lines = read_csv("world-population.csv")
lines[:2]
    [['900',
      'World',
      '2',
      'Medium',
      '1950',
      '1950.5',
      '1266259.556',
      '1270171.462',
      '2536431.018',
      '19.497'],
     ['900',
      'World',
      '2',
      'Medium',
      '1951',
      '1951.5',
      '1290237.638',
      '1293796.589',
      '2584034.227',
      '19.863']]
import csv

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)

        # Skip header
        next(reader)

        lines = []
        for line in reader:
            lines.append(line)

        return lines
import csv

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)

        # Skip header
        next(reader)

        return list(reader)
lines = read_csv("world-population.csv")
lines[:2]
    [['900',
      'World',
      '2',
      'Medium',
      '1950',
      '1950.5',
      '1266259.556',
      '1270171.462',
      '2536431.018',
      '19.497'],
     ['900',
      'World',
      '2',
      'Medium',
      '1951',
      '1951.5',
      '1290237.638',
      '1293796.589',
      '2584034.227',
      '19.863']]

What if we want to return the header instead of skip it?

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        header = next(reader)
        lines = list(reader)
        return header, lines
header, lines = read_csv("world-population.csv")
header
    ['LocID',
     'Location',
     'VarID',
     'Variant',
     'Time',
     'MidPeriod',
     'PopMale',
     'PopFemale',
     'PopTotal',
     'PopDensity']
lines[:2]
    [['900',
      'World',
      '2',
      'Medium',
      '1950',
      '1950.5',
      '1266259.556',
      '1270171.462',
      '2536431.018',
      '19.497'],
     ['900',
      'World',
      '2',
      'Medium',
      '1951',
      '1951.5',
      '1290237.638',
      '1293796.589',
      '2584034.227',
      '19.863']]
def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        return next(reader), list(reader)
header, lines = read_csv("world-population.csv")
header
    ['LocID',
     'Location',
     'VarID',
     'Variant',
     'Time',
     'MidPeriod',
     'PopMale',
     'PopFemale',
     'PopTotal',
     'PopDensity']
lines[:2]
    [['900',
      'World',
      '2',
      'Medium',
      '1950',
      '1950.5',
      '1266259.556',
      '1270171.462',
      '2536431.018',
      '19.497'],
     ['900',
      'World',
      '2',
      'Medium',
      '1951',
      '1951.5',
      '1290237.638',
      '1293796.589',
      '2584034.227',
      '19.863']]
for line in lines[:3]:
    for name, value in zip(header, line):
        print(f"{name}: {value}")
    print()
    LocID: 900
    Location: World
    VarID: 2
    Variant: Medium
    Time: 1950
    MidPeriod: 1950.5
    PopMale: 1266259.556
    PopFemale: 1270171.462
    PopTotal: 2536431.018
    PopDensity: 19.497

    LocID: 900
    Location: World
    VarID: 2
    Variant: Medium
    Time: 1951
    MidPeriod: 1951.5
    PopMale: 1290237.638
    PopFemale: 1293796.589
    PopTotal: 2584034.227
    PopDensity: 19.863

    LocID: 900
    Location: World
    VarID: 2
    Variant: Medium
    Time: 1952
    MidPeriod: 1952.5
    PopMale: 1313854.565
    PopFemale: 1317007.125
    PopTotal: 2630861.69
    PopDensity: 20.223
for letter, number, word in zip(['a', 'b', 'c'], ['1', '2', '3','4'], ['this', 'is', 'cool']):
    print(letter, number, word)

print(list(zip('abc', '123')))

print(list(range(10)))
    a 1 this
    b 2 is
    c 3 cool
    [('a', '1'), ('b', '2'), ('c', '3')]
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
for letter, number in zip('abc', '123'):
    print(letter, number)

Joining the header and the values together seems useful.

Can we write a function that does that for us?

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        header = next(reader)
        records = []
        for line in reader:
            record = {}
            for name, value in zip(header, line):
                record[name] = value
            records.append(record)

    return records
records = read_csv("world-population.csv")
records[:2]
    [{'LocID': '900',
      'Location': 'World',
      'VarID': '2',
      'Variant': 'Medium',
      'Time': '1950',
      'MidPeriod': '1950.5',
      'PopMale': '1266259.556',
      'PopFemale': '1270171.462',
      'PopTotal': '2536431.018',
      'PopDensity': '19.497'},
     {'LocID': '900',
      'Location': 'World',
      'VarID': '2',
      'Variant': 'Medium',
      'Time': '1951',
      'MidPeriod': '1951.5',
      'PopMale': '1290237.638',
      'PopFemale': '1293796.589',
      'PopTotal': '2584034.227',
      'PopDensity': '19.863'}]
records[8]
    {'LocID': '900',
     'Location': 'World',
     'VarID': '2',
     'Variant': 'Medium',
     'Time': '1958',
     'MidPeriod': '1958.5',
     'PopMale': '1462864.322',
     'PopFemale': '1462822.358',
     'PopTotal': '2925686.68',
     'PopDensity': '22.489'}

Looking up all the data for single entry is easy.

How would I pull out all the values for a specific column?

def select_values(records, name):
    values = []
    for record in records:
        values.append(record[name])
    return values
total_pop = select_values(records, "PopTotal")
total_pop[:5]
    ['2536431.018', '2584034.227', '2630861.69', '2677609.061', '2724846.754']
years = select_values(records, 'Time')
years[:5]
    ['1950', '1951', '1952', '1953', '1954']
plt.plot(years, total_pop)
plt.title("Total Population")
    Text(0.5, 1.0, 'Total Population')

png

🤨

plt.plot(['a', 'b', 'c', 'd'], ['x', 'y', 'z', 'x'])
    [<matplotlib.lines.Line2D at 0x11e7845b0>]

png

While that feature is cool, that’s not quite what we wanted in our scenario.

We want to convert the string values into integers and floating-point numbers.

def select_values_as_int(records, name):
    values = []
    for record in records:
        values.append(int(record[name]))
    return values
def select_values_as_float(records, name):
    values = []
    for record in records:
        values.append(float(record[name]))
    return values
def select_values(records, name, transform):
    values = []
    for record in records:
        values.append(transform(record[name]))
    return values
total_pop = select_values(records, 'PopTotal', float)
years = select_values(records, 'Time', int)
plt.plot(years, total_pop)
plt.title("Total Population")
    Text(0.5, 1.0, 'Total Population')

png

records = read_csv("world-population.csv")

total_pop = select_values(records, 'PopTotal', float)
years = select_values(records, 'Time', int)

plt.plot(years, total_pop)
plt.title("Total Population")
    Text(0.5, 1.0, 'Total Population')

png

We call this approach the record model.

We have a list of dictionaries, each dictionary represents a row in the table.

Now, there is another way to parse a CSV.

What if, instead of getting a list of dictionaries (i.e. rows), we got a dictionary of lists (i.e. columns)?

def read_csv(filename):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        data = {}

        header = next(reader)
        for name in header:
            data[name] = []

        for line in reader:
            for name, value in zip(header, line):
                data[name].append(value)

    return data
data = read_csv("world-population.csv")
for name, values in data.items():
    print(f"{name}: {values[:5]}")
    LocID: ['900', '900', '900', '900', '900']
    Location: ['World', 'World', 'World', 'World', 'World']
    VarID: ['2', '2', '2', '2', '2']
    Variant: ['Medium', 'Medium', 'Medium', 'Medium', 'Medium']
    Time: ['1950', '1951', '1952', '1953', '1954']
    MidPeriod: ['1950.5', '1951.5', '1952.5', '1953.5', '1954.5']
    PopMale: ['1266259.556', '1290237.638', '1313854.565', '1337452.786', '1361313.834']
    PopFemale: ['1270171.462', '1293796.589', '1317007.125', '1340156.275', '1363532.92']
    PopTotal: ['2536431.018', '2584034.227', '2630861.69', '2677609.061', '2724846.754']
    PopDensity: ['19.497', '19.863', '20.223', '20.582', '20.945']
def read_csv(filename, transforms):
    with open(filename, 'rt') as file:
        reader = csv.reader(file)
        data = {}

        header = next(reader)
        for name in header:
            data[name] = []

        for line in reader:
            for name, value in zip(header, line):
                transform = str
                if name in transforms:
                    transform = transforms[name]

                data[name].append(transform(value))

    return data
transforms = {
    'PopTotal': float,
    'Time': int,
    'PopMale': float,
    'PopFemale': float,
    'PopDensity': float
}
data = read_csv("world-population.csv", transforms)
for name, values in data.items():
    print(f"{name}: {values[:5]}")
    LocID: ['900', '900', '900', '900', '900']
    Location: ['World', 'World', 'World', 'World', 'World']
    VarID: ['2', '2', '2', '2', '2']
    Variant: ['Medium', 'Medium', 'Medium', 'Medium', 'Medium']
    Time: [1950, 1951, 1952, 1953, 1954]
    MidPeriod: ['1950.5', '1951.5', '1952.5', '1953.5', '1954.5']
    PopMale: [1266259.556, 1290237.638, 1313854.565, 1337452.786, 1361313.834]
    PopFemale: [1270171.462, 1293796.589, 1317007.125, 1340156.275, 1363532.92]
    PopTotal: [2536431.018, 2584034.227, 2630861.69, 2677609.061, 2724846.754]
    PopDensity: [19.497, 19.863, 20.223, 20.582, 20.945]
plt.plot(data['Time'], data['PopTotal'])
plt.title("Total Population")
    Text(0.5, 1.0, 'Total Population')

png

transforms = {
    'PopTotal': float,
    'Time': int,
    'PopMale': float,
    'PopFemale': float,
    'PopDensity': float
}
data = read_csv("world-population.csv", transforms)

plt.plot(data['Time'], data['PopTotal'])
plt.title("Total Population")
    Text(0.5, 1.0, 'Total Population')

png

Now, getting whole columns is really easy.

How can I get individual records?

def get_record(data, index):
    record = {}
    for name, values in data.items():
        record[name] = values[index]
    return record
get_record(data, 8)
    {'LocID': '900',
     'Location': 'World',
     'VarID': '2',
     'Variant': 'Medium',
     'Time': 1958,
     'MidPeriod': '1958.5',
     'PopMale': 1462864.322,
     'PopFemale': 1462822.358,
     'PopTotal': 2925686.68,
     'PopDensity': 22.489}

We call this the dataframe model.

We have a dictionary of lists, each list representing a column in the table.

Key Ideas

  • When reading CSVs (or other tabular data), we will typically use either the record model or the dataframe model.
    • The record model is to store the data as a list of dictionaries, each dictionary representing a row or record.
    • The dataframe model is to store the data as a dictionary of lists, each list representing a column or series.
  • zip is pretty handy
  • We can pass functions as arguments to other functions!

What did you learn today?

What styles were more intuitive to you?

What approaches seem more useful to you? Why?