Strings Part 3
Unicode
-
We used to represent strings using the ASCII character encoding
-
ASCII defines a 7-bit code for every character, usually stored in one 8-bit byte (every bit is a zero or one)
-
For example, the letter ‘a’ is
110 0001
and the letter ‘b’ is
110 0010
-
Not enough room for other languages!
-
Python uses the more modern Unicode standard
-
Unicode gives every character a numeric code point; the most common characters fit in 16 bits, but some (such as emojis) need up to 21 bits
-
Supports all languages, even emojis
Hexadecimal
- 16 bits is a lot of 1s and 0s!
- instead we use hexadecimal notation
binary | decimal | hexadecimal |
---|---|---|
0001 | 1 | 1 |
0010 | 2 | 2 |
… | … | … |
1001 | 9 | 9 |
1010 | 10 | A |
1011 | 11 | B |
… | … | … |
1111 | 15 | F |
Unicode and hexadecimal
- for Unicode, we use \u followed by the 4-digit hex representation
- for example, à is \u00E0
- and 🔥 is \U0001F525 (for code points with more than 4 hex digits, you need a capital U and leading zeroes to make exactly 8 hex digits)
- my name in Italian:
last_name = 'Zappal\u00E0'
print(last_name)
# if your keyboard supports unicode
full_name = 'Daniel Zappalà'
print(full_name)
Zappalà
Daniel Zappalà
other Unicode examples
goodbye = 'Hasta mañana'
print(goodbye)
print('This lesson is on \U0001F525')
Hasta mañana
This lesson is on 🔥
endswith and startswith
str.endswith(substring)
and str.startswith(substring)
- returns True or False
value = 'cougars'
print(value.endswith('rs'))
print(value.endswith('x'))
print(value.startswith('co'))
print(value.startswith('cu'))
True
False
True
False
replace
str.replace(old, new)
- returns a new string with all instances of old replaced with new
value = 'Utah is the right place'
new_value = value.replace('Utah', 'Oregon')
print(new_value)
# replace doesn't respect word boundaries
sentence = 'this is it'
new_sentence = sentence.replace('is', 'xxx')
print(new_sentence)
# original strings unchanged
print(value, sentence)
Oregon is the right place
thxxx xxx it
Utah is the right place this is it
split
str.split(delimiter)
- returns a list containing pieces of the string in between the delimiters
- most common delimiters - comma, space, newline
some_input = 'Emma:Smith:94'
values = some_input.split(':')
print(values)
['Emma', 'Smith', '94']
# by default, splits on whitespace
some_input = 'Emma Smith 94'
values = some_input.split()
print(values)
# can split on anything
some_input = 'rockpaperscissorsrockpaperscissorsrockpaperscissors'
values = some_input.split('paper')
print(values)
values = some_input.split('scissorsrock')
print(values)
['Emma', 'Smith', '94']
['rock', 'scissorsrock', 'scissorsrock', 'scissors']
['rockpaper', 'paper', 'paperscissors']
join
delimiter.join(list)
- returns a new string made by joining the items of the list together, with the delimiter placed between each pair of items
values = ['Emma', 'Smith', '94']
result = ','.join(values)
print(result)
# can use any delimiter
result = 'hello'.join(values)
print(result)
Emma,Smith,94
EmmahelloSmithhello94
Parsing files
- files often come with values separated by spaces or commas
- common parsing tasks
- read the file line by line
- use split to separate each line into values
- compute something using these values
Simple parsing example
We have a file called sample-file.txt
that contains:
This is a note to myself
to remember to send some money to
Dominic. He could use some help
with rent.
# Simple parsing example
def mentions(filename, keyword):
    """Return True if keyword appears as a word in the file, else False.

    Each line is split on whitespace and every piece is compared to
    keyword exactly, so punctuation attached to a word counts as part
    of that word ('rent.' is not equal to 'rent').
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            for word in words:
                if word == keyword:
                    # stop reading as soon as we find a match
                    return True
    # only reached after every word on every line has been checked
    return False
print(mentions('sample-file.txt', 'money'))
print(mentions('sample-file.txt', 'monkey'))
True
False
- Remember our file:
This is a note to myself
to remember to send some money to
Dominic. He could use some help
with rent.
- why does this return false?
print(mentions('sample-file.txt', 'rent'))
False
- remember we can use strip()
word = "rent."
result = word.strip('.')
print(result)
rent
# Simple parsing example -- fixed to remove punctuation from the end of words
def mentions(filename, keyword):
    """Return True if keyword appears as a word in the file, else False.

    Each line is split on whitespace. Before comparing, any of the
    characters . ? ! : are stripped from both ends of each word, so
    'rent.' matches the keyword 'rent'.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            for word in words:
                # strip() removes these characters from the start and end only
                word = word.strip('.?!:')
                if word == keyword:
                    return True
    # only reached after every word on every line has been checked
    return False
print(mentions('sample-file.txt', 'money'))
print(mentions('sample-file.txt', 'monkey'))
print(mentions('sample-file.txt', 'rent'))
True
False
True
Structured data
- often the files we use have structured data
- or if they are not structured, we clean them up so they are :-)
- for example, consider a file called
grades.txt
:
Smith,Emma,HW1,100,95
White,Jacob,HW1,100,92
Fitzgerald,Brianna,HW1,100,99
Pandey,Manoj,HW2,100,93
def average_score(filename, assignment_name):
    """Return the average score for one assignment in a comma-separated file.

    Every line is split on commas; the value at index 2 is compared to
    assignment_name and the value at index 4 is the score that gets
    averaged. Returns 0 if no line matches assignment_name.
    """
    sum = 0
    count = 0
    average = 0
    with open(filename) as file:
        for line in file:
            # split the line by comma
            # (strip() first, so the newline is not stuck to the last value)
            values = line.strip().split(',')
            if values[2] == assignment_name:
                # be sure to convert to int
                sum += int(values[4])
                count += 1
    # avoid dividing by zero when no lines matched
    if count > 0:
        average = sum / count
    return average
average = average_score('grades.txt', 'HW1')
print(f"Average score for HW1: {average}")
Average score for HW1: 95.33333333333333
notice we can call strip() and split() one right after another
- strip() returns a string, so we can call split() on its result; split() then returns a list of strings
values = line.strip().split(',')
if values[2] == assignment:
# be sure to convert to int
sum += int(values[4])
count += 1
notice we need to use list notation for values
and we need to convert strings to ints!
unpacking
- we can unpack a list directly into variables if we know how long it is
- for example:
values = line.split(',')
- becomes:
last, first, assignment, grade = line.split(',')
- we now have four variables, one for every value in between commas
def average_score(filename, assignment_name):
    """Return the average score for one assignment in a comma-separated file.

    Every line must contain exactly five comma-separated values, which
    are unpacked into last, first, assignment, total and score. Scores
    are averaged over the lines whose assignment equals assignment_name.
    Returns 0 if no line matches assignment_name.
    """
    sum = 0
    count = 0
    average = 0
    with open(filename) as file:
        for line in file:
            # split the line by comma and unpack the list
            last, first, assignment, total, score = line.strip().split(',')
            if assignment == assignment_name:
                # be sure to convert to int
                sum += int(score)
                count += 1
    # avoid dividing by zero when no lines matched
    if count > 0:
        average = sum / count
    return average
average = average_score('grades.txt', 'HW1')
print(f"Average score for HW1: {average}")
Average score for HW1: 95.33333333333333
Reformatting data
- consider a file called
grades.txt
:
Smith,Emma,HW1,100,95
White,Jacob,HW1,100,92
Fitzgerald,Brianna,HW1,100,99
Pandey,Manoj,HW2,100,93
- maybe we would like to use a percentage instead of a raw score
def convert_to_percentage(filename):
    """Print every line of the file with the score replaced by score/total.

    Every line must contain exactly five comma-separated values:
    last, first, assignment, total, score. The last value printed is
    the fraction int(score) / int(total), for example 0.95.
    """
    with open(filename) as file:
        for line in file:
            # split the line by comma and unpack the list
            last, first, assignment, total, score = line.strip().split(',')
            percentage = int(score)/int(total)
            # join() needs strings, so convert the number back with str()
            my_line = ','.join([last, first, assignment, total, str(percentage)])
            print(my_line)
convert_to_percentage('grades.txt')
Smith,Emma,HW1,100,0.95
White,Jacob,HW1,100,0.92
Fitzgerald,Brianna,HW1,100,0.99
Pandey,Manoj,HW2,100,0.93
Writing Files
- You can open a file for writing using familiar syntax
- Pass ‘w’ as the second argument to open() to write to the file, or ‘a’ to append to the file
- Then write to the file with
file.write()
# open the file for writing; the file is closed automatically after the block
with open('some-file.txt', 'w') as file:
    file.write('hello')
- does not add any newlines
def convert_to_percentage(filename, outfilename):
    """Copy filename to outfilename with the score replaced by score/total.

    Every input line must contain exactly five comma-separated values:
    last, first, assignment, total, score. Each output line has the
    same first four values followed by int(score) / int(total), and
    ends with a newline. outfilename is opened with 'w', so anything
    already in it is erased.
    """
    # open the input file for reading
    with open(filename) as file:
        # open the output file for writing
        with open(outfilename, 'w') as outfile:
            for line in file:
                # split the line by comma and unpack the list
                last, first, assignment, total, score = line.strip().split(',')
                percentage = int(score)/int(total)
                my_line = ','.join([last, first, assignment, total, str(percentage)])
                # add a newline
                my_line += '\n'
                # write the line
                outfile.write(my_line)
convert_to_percentage('grades.txt', 'transformed-grades.txt')
Be careful!
Any time you open a file for writing with ‘w’ you will erase whatever was in the file previously if it already exists! (Use ‘a’ if you want to add to the end of the file instead.)