Acrostic football league tables

Following from a question to the Guardian's "The Knowledge" column I've tried to find a longer acrostic than "TABLE".

“Has there ever been a longer acrostic spelled out in a table than the oft-recurring ‘TABLE’ from the Premier League this season – or ‘LAWNS’ from League Two?” asks Marco Jackson.

To do this I'm going to fetch data on every match played in English leagues since 1994-95 and use it to generate league tables for the end of every day that a match has been played. I'll then check the first letter of every team in the league table against a dictionary of English words to find the longest.

Import needed modules

import sys
import urllib.parse
import os
import csv
import re

from bs4 import BeautifulSoup
import requests
import pandas as pd
pd.options.display.float_format = '{:,.0f}'.format

Fetch league data

From http://www.football-data.co.uk/englandm.php - series of CSV tables with data on matches played in English leagues (Premier League, Division One/Championship, Division Two/League One, Division Three/League Two, Conference) since 1994/95.

I haven't checked the accuracy of this data against any other source. In particular, it doesn't include any data on points deductions which do happen and affect the league table.

# download the main page and extract the links to CSV files
data_url = "http://www.football-data.co.uk/englandm.php"
r = requests.get(data_url)
soup = BeautifulSoup(r.text, 'html.parser')

# go through all the links in the document
for link in soup.find_all('a'):
    href = link.get('href')
    # check for those that are CSV files
    if href.endswith(".csv"):
        with open("data/" + href.replace("/", "_"), "wb") as fd:
            data_file_url = urllib.parse.urljoin(r.url, href)
            r_data = requests.get(data_file_url, stream=True)
            # write the file to disk in the `/data` directory
            for chunk in r_data.iter_content(chunk_size=128):
                fd.write(chunk)
            print("Downloaded {}".format(data_file_url))

Fetch dictionary data

From https://github.com/dwyl/english-words/blob/master/words_alpha.txt

dictionary = requests.get('https://github.com/dwyl/english-words/blob/master/words_alpha.txt?raw=true', stream=True)
with open("data/dictionary.txt", "wb") as c:
    for chunk in dictionary.iter_content(chunk_size=128):
        c.write(chunk)

Useful functions

matches_to_table() takes a list of matches (as a pandas dataframe) and a list of teams and turns it into a league table. The optional list of teams ensures that any teams that haven't yet played are included in the table (if not provided it's worked out based on the matches. Tables are sorted according to the following criteria: points, then goal difference, then goals scored.

The matches dataframe requires the following columns for each match:

  • HomeTeam - team playing at home
  • AwayTeam - team playing away
  • FTR - full time result: "H" = Home win, "A" = Away win, "D" = Draw
  • FTHG - goals scored by full time by the home team
  • FTAG - goals scored by full time by the home team
def matches_to_table(matches, teams=None, points_for_win=3, points_for_draw=1):
    if not teams:
        teams = list(set(list(matches.HomeTeam.unique()) + list(matches.AwayTeam.unique())))

    fields = ["played", "won", "drawn", "lost", "for", "against", "gd", "points", "points_deduction"]
    table = pd.DataFrame(index=teams, columns=fields).fillna(0)
    table.loc[:, "won"]     = table.loc[:, "won"].add(    matches[matches.FTR == "H"].HomeTeam.value_counts(), fill_value=0) # Home wins
    table.loc[:, "lost"]    = table.loc[:, "lost"].add(   matches[matches.FTR == "A"].HomeTeam.value_counts(), fill_value=0) # Home losses
    table.loc[:, "drawn"]   = table.loc[:, "drawn"].add(  matches[matches.FTR == "D"].HomeTeam.value_counts(), fill_value=0) # Home draws
    table.loc[:, "won"]     = table.loc[:, "won"].add(    matches[matches.FTR == "A"].AwayTeam.value_counts(), fill_value=0) # Away wins
    table.loc[:, "lost"]    = table.loc[:, "lost"].add(   matches[matches.FTR == "H"].AwayTeam.value_counts(), fill_value=0) # Away losses
    table.loc[:, "drawn"]   = table.loc[:, "drawn"].add(  matches[matches.FTR == "D"].AwayTeam.value_counts(), fill_value=0) # Away draws
    table.loc[:, "for"]     = table.loc[:, "for"].add(    matches.groupby("HomeTeam").sum()["FTHG"]          , fill_value=0) # home for
    table.loc[:, "for"]     = table.loc[:, "for"].add(    matches.groupby("AwayTeam").sum()["FTAG"]          , fill_value=0) # away for
    table.loc[:, "against"] = table.loc[:, "against"].add(matches.groupby("HomeTeam").sum()["FTAG"]          , fill_value=0) # home against
    table.loc[:, "against"] = table.loc[:, "against"].add(matches.groupby("AwayTeam").sum()["FTHG"]          , fill_value=0) # away against
    table.loc[:, "points"]  = (table.loc[:, "won"] * points_for_win) + \
                              (table.loc[:, "drawn"] * points_for_draw) - table.loc[:, "points_deduction"]
    table.loc[:, "gd"]      = table.loc[:, "for"] - table.loc[:, "against"]
    table.loc[:, "played"]  = table.loc[:, "won"] + table.loc[:, "drawn"] + table.loc[:, "lost"]
    return table.sort_values(["points", "gd", "for"], ascending=False)

table_to_string() takes the table output from matches_to_table and turns it into a string consisting of the first letter of each of the teams in the table (in order)

def table_to_string(table):
    return "".join([t[0] for t in table.index])

get_char_ngrams() takes a string and returns a list of ngrams found within it, with the length of the ngrams found controlled by min_n and max_n.

def get_char_ngrams(input_string, min_n=4, max_n=100):
    ngrams = []
    for n in range(min_n, max_n):
        ngrams += [input_string[i:i+n] for i in range(len(input_string)-n+1)]
    return ngrams

get_division_from_filename() gets the division and year based on the filename given

def get_division_from_filename(filename):
    filename = re.split('_|\.', filename)
    year = filename[1]
    if int(year[0:2]) > 90:
        year = int(year[0:2]) + 1900
    else:
        year = int(year[0:2]) + 2000
    if year < 2004:
        division = {
            "E0": "Premier League",
            "E1": "Division One",
            "E2": "Division Two",
            "E3": "Division Three",
            "EC": "Conference"
        }[filename[2]]
    else:
        division = {
            "E0": "Premier League",
            "E1": "Championship",
            "E2": "League One",
            "E3": "League Two",
            "EC": "Conference"
        }[filename[2]]

    year = "{}-{}".format( str(year), str(year+1)[2:4])

    return (year, division)
def find_word(word, found_words, full=False):
    t = tuple([w for w in found_words if w[3]==word.lower()][0][0:2])
    print(get_division_from_filename(t[0]) + (t[1],))
    table = tables[t].reset_index().rename(columns={"index": "Team"})
    table.index = table.index + 1
    if full:
        return table
    word_location = "".join([s[0].lower() for s in list(table.Team)]).find(word)
    return table.iloc[slice(max(word_location - 1, 0), word_location + len(word) + 1, None)]

Generate tables

Here I go through each file downloaded in the /data directory, and carry out the following steps:

  1. read file into a pandas dataframe
  2. find unique dates on which matches were played
  3. go through each date and extract the matches played by that date
  4. create a table based on the matches played by that date
  5. extract the string of the first letter of each team played by that date

The strings and tables are added to arrays to make it easy to find them again.

tstrings = []
files_checked = []
tables = {}
for dfile in os.listdir("data"):
    if dfile in files_checked:
        continue
    if dfile.endswith(".csv"):
        try:
            df = pd.read_csv("data/{}".format(dfile)) # step 1
#             print(dfile)
        except pd.errors.ParserError:
            print(dfile + " [SKIPPED]")
            continue
        df.loc[:, "Date"] = pd.to_datetime(df.Date, format="%d/%m/%y")
        df.dropna(inplace=True, how="all")
        teams = list(set(list(df.HomeTeam.unique()) + list(df.AwayTeam.unique())))
        for i in df.Date.unique(): # step 2
            played_matches = df[df.Date <= i] # step 3
            table = matches_to_table(played_matches, teams) # step 4
            tables[(dfile, str(i)[0:10])] = table
            tstring = table_to_string(table) # step 5
            tstrings.append((dfile, str(i)[0:10], tstring))
        files_checked.append(dfile)
#             print("{} - {} matches played by {}".format(tstring, len(played_matches), i))
# save the strings to a CSV file for later use
with open("tablestrings.csv", "w", newline='') as b:
    writer = csv.writer(b)
    writer.writerow(["file", "date", "tstring"])
    for t in tstrings:
        writer.writerow(t)
"{:,} files checked (each file contains matches for one year for one division)".format(len(files_checked))
'113 files checked (each file contains matches for one year for one division)'
"{:,} table strings generated. Example string: {}".format(len(tstrings), tstrings[0][2])
'10,268 table strings generated. Example string: CCMTLLSSDALNMABWICEM'

Import dictionary data

dictionary = []
with open("data/dictionary.txt", "r") as c:
    dictionary = c.readlines()
dictionary = set([d.strip().lower() for d in dictionary if len(d.strip()) >= 5])

Find long words in the table strings

Split each iteration of the league table into ngrams and check whether it is a word found in the dictionary.

found_words = []
for k, t in enumerate(tstrings):
    ngrams = get_char_ngrams(t[2].lower(), min_n=5)
    ngrams = set(ngrams)
    found = ngrams.intersection(dictionary)
    for f in found:
        found_words.append(list(t) + [f])
    if k % 1000 == 0:
        print("Iteration {}, found {}".format(k, len(found_words)))
print("Iteration {}, found {} [COMPLETE]".format(k, len(found_words)))
Iteration 0, found 0
Iteration 1000, found 73
Iteration 2000, found 109
Iteration 3000, found 146
Iteration 4000, found 195
Iteration 5000, found 242
Iteration 6000, found 330
Iteration 7000, found 358
Iteration 8000, found 411
Iteration 9000, found 450
Iteration 10000, found 478
Iteration 10267, found 481 [COMPLETE]

Words with six or more letters

[list(get_division_from_filename(w[0])) + [w[1], w[3], len(w[3])] for w in found_words if len(w[3])>5]
[['2000-01', 'Premier League', '2000-12-17', 'maills', 6],
 ['2000-01', 'Premier League', '2000-12-18', 'maills', 6],
 ['2000-01', 'Premier League', '2000-12-22', 'maills', 6],
 ['2000-01', 'Premier League', '2001-01-30', 'maslin', 6],
 ['2000-01', 'Premier League', '2001-04-02', 'maills', 6],
 ['2000-01', 'Premier League', '2001-04-04', 'maills', 6],
 ['2000-01', 'Premier League', '2001-04-10', 'maills', 6],
 ['2000-01', 'Premier League', '2001-04-11', 'maills', 6],
 ['2000-01', 'Premier League', '2001-04-30', 'scants', 6],
 ['2000-01', 'Premier League', '2001-05-01', 'scants', 6],
 ['2001-02', 'Division Three', '2001-10-27', 'blokes', 6],
 ['2001-02', 'Division Three', '2001-10-28', 'blokes', 6],
 ['2006-07', 'Premier League', '2006-08-22', 'rental', 6],
 ['2006-07', 'Premier League', '2007-03-14', 'albert', 6],
 ['2006-07', 'League One', '2006-12-30', 'snobby', 6],
 ['2008-09', 'League One', '2009-01-27', 'smolts', 6],
 ['2008-09', 'League Two', '2009-01-03', 'begall', 6],
 ['2010-11', 'League Two', '2010-09-11', 'cobcab', 6],
 ['2011-12', 'League Two', '2011-08-20', 'monasa', 6],
 ['2012-13', 'Conference', '2013-04-09', 'chants', 6],
 ['2013-14', 'Premier League', '2014-02-08', 'camlet', 6],
 ['2013-14', 'Premier League', '2014-03-08', 'amtmen', 6],
 ['2014-15', 'League Two', '2015-02-24', 'peacod', 6],
 ['2015-16', 'League Two', '2015-09-12', 'cancha', 6],
 ['2015-16', 'League Two', '2015-10-21', 'blanch', 6],
 ['1993-94', 'Premier League', '1994-05-07', 'tmesis', 6],
 ['1993-94', 'Premier League', '1994-05-08', 'tmesis', 6],
 ['1996-97', 'Premier League', '1997-02-15', 'wastel', 6],
 ['1998-99', 'Division Three', '1998-12-18', 'cherts', 6],
 ['1999-00', 'Premier League', '2000-03-05', 'calesa', 6]]

Words with five letters

five_letters = [list(get_division_from_filename(w[0])) + [w[1], w[3], len(w[3])] for w in found_words if len(w[3])==5]
print(len(five_letters))
# five_letters
451
", ".join(sorted(set([w[3] for w in five_letters])))
'abama, abamp, accel, alans, allan, allis, amahs, ambos, amlet, amsel, ancha, ancle, ankhs, anlas, appal, apter, ascan, astel, atlas, awacs, baccy, baled, banal, barat, barms, basan, bassa, beant, becco, belam, belch, betas, bibbs, blams, blanc, blens, bless, blobs, bloke, blows, bolty, brent, cabda, cable, caleb, camel, camla, canch, canis, canos, cants, capsa, casts, catel, celts, chant, chats, cheap, cheek, chert, chics, claes, clans, clape, clast, cleam, clefs, clomb, clyde, cobby, coman, comby, comdt, compd, copes, copps, copra, cordy, corms, cotes, crabs, crost, dampy, dobra, dwarf, egall, elamp, ental, etwas, fable, facet, fawns, feces, forbs, fpsps, gasts, gecks, ghast, gotra, hants, herls, hewgh, hoers, hyleg, kohls, laban, laics, lamas, lames, lanas, lanch, lansa, lawns, leban, lessn, liwan, llama, macan, maced, maces, macle, madge, maill, mails, malam, masts, mated, mates, meach, melds, merak, metal, mewls, molts, monas, mosso, nabal, nable, nambe, nants, nasab, nasch, neaps, nobby, norit, orang, parma, plebs, plote, ponds, praam, prate, proms, pyche, regal, resaw, rests, rotas, rybat, sabes, samba, sawed, scale, scams, scant, scase, sceat, scobs, scrob, segos, sensa, shawn, sherd, slabs, slaws, smash, smolt, snows, snowy, stale, stops, stoss, swail, swash, swats, swing, swobs, swots, syces, table, taces, taels, tamas, teals, thaws, tramp, twaes, twale, wants, wasco, wasnt, waste, whang, wises, worms'

Words in context

Display the league table centered around the teams making the words.

find_word("blokes", found_words)
('2001-02', 'Division Three', '2001-10-27')
Team played won drawn lost for against gd points points_deduction
14 Darlington 15 5 4 6 19 17 2 19 0
15 Bristol Rvs 14 5 4 5 13 15 -2 19 0
16 Lincoln 16 4 6 6 16 17 -1 18 0
17 Oxford 16 4 6 6 15 16 -1 18 0
18 Kidderminster 16 4 5 7 10 17 -7 17 0
19 Exeter 16 4 5 7 18 31 -13 17 0
20 Swansea 16 4 4 8 22 29 -7 16 0
21 Hartlepool 15 4 3 8 14 16 -2 15 0
find_word("chants", found_words)
('2012-13', 'Conference', '2013-04-09')
Team played won drawn lost for against gd points points_deduction
12 Braintree Town 42 17 7 18 59 70 -11 58 0
13 Cambridge 44 14 14 16 65 66 -1 56 0
14 Hyde United 44 16 7 21 61 67 -6 55 0
15 Alfreton Town 43 14 12 17 64 71 -7 54 0
16 Nuneaton Town 44 12 15 17 51 62 -11 51 0
17 Tamworth 42 14 9 19 51 62 -11 51 0
18 Southport 44 13 12 19 70 83 -13 51 0
19 Gateshead 43 12 14 17 54 59 -5 50 0
find_word("snobby", found_words)
('2006-07', 'League One', '2006-12-30')
Team played won drawn lost for against gd points points_deduction
1 Scunthorpe 25 14 6 5 38 20 18 48 0
2 Nott'm Forest 25 14 6 5 35 20 15 48 0
3 Oldham 25 13 6 6 40 21 19 45 0
4 Bristol City 25 13 6 6 34 24 10 45 0
5 Blackpool 25 11 8 6 40 27 13 41 0
6 Yeovil 24 11 8 5 31 22 9 41 0
7 Swansea 25 11 6 8 40 29 11 39 0
find_word("albert", found_words)
('2006-07', 'Premier League', '2007-03-14')
Team played won drawn lost for against gd points points_deduction
2 Chelsea 29 20 6 3 51 19 32 66 0
3 Arsenal 28 16 7 5 51 23 28 55 0
4 Liverpool 29 16 5 8 44 20 24 53 0
5 Bolton 29 14 5 10 34 34 0 47 0
6 Everton 29 11 10 8 37 26 11 43 0
7 Reading 29 13 4 12 43 38 5 43 0
8 Tottenham 29 12 6 11 40 43 -3 42 0
9 Portsmouth 29 11 8 10 36 31 5 41 0
find_word("table", found_words)
('2017-18', 'Premier League', '2017-12-23')
Team played won drawn lost for against gd points points_deduction
4 Liverpool 19 9 8 2 41 23 18 35 0
5 Tottenham 19 10 4 5 34 18 16 34 0
6 Arsenal 19 10 4 5 34 23 11 34 0
7 Burnley 19 9 5 5 16 15 1 32 0
8 Leicester 19 7 6 6 29 28 1 27 0
9 Everton 19 7 5 7 24 30 -6 26 0
10 Watford 19 6 4 9 27 34 -7 22 0

Teams in alphabetical order

I also wanted to look for the table in which a run of teams were in alphabetical order. I've ignored tables in August and September when the league table is less developed and so a default alphabetical team list is likely to take place.

alpha_tables = []
for i in tables:
    division = get_division_from_filename(i[0])
    team_order = list(tables[i].index)
    ngrams = get_char_ngrams(team_order, min_n=5)
    for n in ngrams:
        if n == sorted(n, reverse=True):
            alpha_tables.append([i, division, n])
gt_seven_teams = [a for a in alpha_tables if len(a[2])>7 and a[2]==sorted(a[2])]
gt_nine_teams = [a for a in alpha_tables if len(a[2])>9]
print(gt_seven_teams[0][1][0], gt_seven_teams[0][1][1], gt_seven_teams[0][0][1])
print(gt_seven_teams[0][2])
t = tables[gt_seven_teams[0][0]].reset_index().rename(columns={"index": "Team"})
t.index = t.index + 1
t
2004-05 League One 2004-11-09
['Bournemouth', 'Bradford', 'Brentford', 'Bristol City', 'Chesterfield', 'Hartlepool', 'Huddersfield', 'Swindon']
Team played won drawn lost for against gd points points_deduction
1 Luton 17 12 2 3 34 18 16 38 0
2 Hull 17 10 2 5 28 22 6 32 0
3 Tranmere 17 9 5 3 26 20 6 32 0
4 Bournemouth 17 9 3 5 31 21 10 30 0
5 Bradford 17 9 2 6 28 25 3 29 0
6 Brentford 17 8 4 5 21 22 -1 28 0
7 Bristol City 17 7 6 4 34 23 11 27 0
8 Chesterfield 17 7 5 5 23 17 6 26 0
9 Hartlepool 17 8 2 7 26 28 -2 26 0
10 Huddersfield 17 7 4 6 28 23 5 25 0
11 Swindon 17 7 4 6 28 25 3 25 0
12 Sheffield Weds 17 6 6 5 25 23 2 24 0
13 Port Vale 17 7 2 8 20 24 -4 23 0
14 Doncaster 17 6 5 6 21 26 -5 23 0
15 Colchester 17 6 4 7 26 22 4 22 0
16 Wrexham 16 5 6 5 18 23 -5 21 0
17 Barnsley 17 4 6 7 21 23 -2 18 0
18 Blackpool 17 4 6 7 21 25 -4 18 0
19 Walsall 17 4 6 7 24 30 -6 18 0
20 Peterboro 17 4 5 8 19 19 0 17 0
21 Oldham 17 4 4 9 21 27 -6 16 0
22 Torquay 16 3 6 7 20 27 -7 15 0
23 Milton Keynes Dons 17 3 5 9 20 31 -11 14 0
24 Stockport 17 2 4 11 16 35 -19 10 0
print(gt_seven_teams[1][1][0], gt_seven_teams[1][1][1], gt_seven_teams[1][0][1])
print(gt_seven_teams[1][2])
t = tables[gt_seven_teams[1][0]].reset_index().rename(columns={"index": "Team"})
t.index = t.index + 1
oteams = t[t.Team.isin(gt_seven_teams[1][2])].index
t.iloc[slice(max(oteams[0] - 2, 0), oteams[-1] + 1, None), :]
2011-12 Premier League 2011-11-27
['Arsenal', 'Aston Villa', 'Everton', 'Norwich', 'QPR', 'Stoke', 'Swansea', 'West Brom']
Team played won drawn lost for against gd points points_deduction
6 Liverpool 13 6 5 2 17 12 5 23 0
7 Arsenal 13 7 2 4 26 23 3 23 0
8 Aston Villa 13 3 7 3 16 17 -1 16 0
9 Everton 12 5 1 6 15 16 -1 16 0
10 Norwich 13 4 4 5 19 21 -2 16 0
11 QPR 13 4 3 6 14 24 -10 15 0
12 Stoke 13 4 3 6 13 23 -10 15 0
13 Swansea 13 3 5 5 12 16 -4 14 0
14 West Brom 13 4 2 7 12 20 -8 14 0
15 Fulham 13 2 6 5 15 16 -1 12 0
print(gt_nine_teams[0][1][0], gt_nine_teams[0][1][1], gt_nine_teams[0][0][1])
print(gt_nine_teams[0][2])
t = tables[gt_nine_teams[0][0]].reset_index().rename(columns={"index": "Team"})
t.index = t.index + 1
oteams = t[t.Team.isin(gt_nine_teams[0][2])].index
t.iloc[slice(max(oteams[0] - 2, 0), oteams[-1] + 1, None), :]
2011-12 Conference 2012-03-31
['York', 'Southport', 'Luton', 'Kidderminster', 'Grimsby', 'Gateshead', 'Forest Green', 'Cambridge', 'Braintree Town', 'Barrow']
Team played won drawn lost for against gd points points_deduction
3 Mansfield 41 20 14 7 74 46 28 74 0
4 York 39 19 13 7 74 41 33 70 0
5 Southport 41 20 10 11 63 60 3 70 0
6 Luton 39 18 13 8 67 37 30 67 0
7 Kidderminster 41 19 10 12 71 55 16 67 0
8 Grimsby 41 18 10 13 75 57 18 64 0
9 Gateshead 40 18 10 12 61 56 5 64 0
10 Forest Green 41 16 13 12 59 42 17 61 0
11 Cambridge 39 15 12 12 47 37 10 57 0
12 Braintree Town 41 16 9 16 70 71 -1 57 0
13 Barrow 41 16 7 18 55 67 -12 55 0
14 Ebbsfleet 40 13 10 17 59 69 -10 49 0