The History of Semantic Spaces
Google's Ngram viewer is awesome. Among other things, it's useful for studying historical interest in particular ideas. The 2011 Science paper that introduced the underlying dataset (Michel et al., 2011) has some fascinating examples of this. Here's one of the least impressive examples from that paper:
The problem with this approach is that it's impossible to tell the difference between interest in the idea of God and usage of the word "God". This is elegantly demonstrated by another example from that paper:
If we had searched only for "the Great War", we might have thought that interest in that topic waned after the start of World War II. The truth, though, is that people became more interested in the Great War. They just started calling it "World War I" instead.
The Science paper shows off some elegant solutions to this problem, but all of them are extremely labor-intensive and apply only to the question they were designed to answer. Wouldn't it be nice to have an automated way to search for semantic spaces rather than having to guess at the particular Ngrams that represent them?
I think I can make that happen.
The Plan¶
My plan is to leverage cutting-edge semantic embeddings to generate a list of closely related words from a search term. Then I'll get time-series data from the Ngram viewer for each of those words. Finally, I'll aggregate the data across words in the set, weighting them by their semantic similarity to the search term. This weighted sum will thus give an account of historical interest in the whole semantic space surrounding the search term.
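In other words, if each related word has a similarity score and a yearly frequency series, the aggregate is just the similarity-weighted sum of those series. A minimal sketch of the arithmetic, with made-up numbers:

import numpy as np
# Hypothetical inputs: cosine similarities of three related words to the
# search term, and each word's yearly frequency series from the viewer.
similarities = np.array([0.77, 0.70, 0.67])
frequencies = np.array([[1.0e-5, 1.2e-5, 1.5e-5],   # one row per word,
                        [0.8e-5, 0.9e-5, 1.1e-5],   # one column per year
                        [0.5e-5, 0.6e-5, 0.6e-5]])
# Weight each series by its similarity, then sum across words to get a
# single series for the whole semantic space.
weighted_total = similarities @ frequencies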
Step 1: Defining Semantic Spaces¶
The first step is to get a list of closely related words, along with a quantification of their semantic similarity to the target. This is made trivial by the gensim package, which comes with a number of pretrained word-vector models. I'll use the "glove-wiki-gigaword-200" model, which is trained on Wikipedia (as it was in 2014) and the Gigaword newswire archive. As such, it provides a reasonable estimate of the kind of associations I (as a literate person living around 2014) might have with any given word - with some bias toward newsworthiness.
import numpy as np
import pandas as pd
import gensim.downloader as gs
# Download (on first use) and load the pretrained GloVe vectors, then sort
# the vocabulary by frequency so the most common words come first.
glove_vectors = gs.load('glove-wiki-gigaword-200')
glove_vectors.sort_by_descending_frequency()
Now we can get our list of most closely related words. For now I'm not going to worry about exactly what "closely related" means, but this will become important later.
glove_vectors.most_similar(['science', 'math'], topn = 10)
[('mathematics', 0.768941342830658),
 ('physics', 0.6959351301193237),
 ('biology', 0.6668677926063538),
 ('teaching', 0.6357982158660889),
 ('curriculum', 0.635124921798706),
 ('chemistry', 0.6340017318725586),
 ('sciences', 0.6332299709320068),
 ('education', 0.630456268787384),
 ('graduate', 0.6268227696418762),
 ('academic', 0.6119447350502014)]
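Under the hood (assuming gensim's usual semantics), each score is just the cosine similarity between the candidate's vector and the normalized average of the two query vectors. A quick sanity check by hand:

import numpy as np
# Average the unit-normalized query vectors, as most_similar does
# internally, then compute the cosine similarity with the top hit.
mean = (glove_vectors.get_vector('science', norm=True)
        + glove_vectors.get_vector('math', norm=True)) / 2
candidate = glove_vectors['mathematics']
mean @ candidate / (np.linalg.norm(mean) * np.linalg.norm(candidate))
# -> ~0.7689, matching the first score above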
Step 2: Scraping¶
The next prerequisite is the ability to get ngram data with code. The code below does this, outputting a nicely formatted pandas DataFrame with a column for each ngram.
import requests

def get_ngrams(query, start_year=1800, end_year=2019):
    # Define parameters for the request
    params = {
        "content": query,
        "year_start": start_year,
        "year_end": end_year,
    }
    # Use a browser-like User-Agent for the undocumented JSON endpoint
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36",
    }
    # Make the request
    html = requests.get("https://books.google.com/ngrams/json",
                        params=params, headers=headers, timeout=30).text
    # Clean the data: one row per ngram at first, one column per year
    df = pd.read_json(html)
    df = pd.DataFrame(df['timeseries'].tolist(),
                      columns=np.arange(start_year, end_year + 1),
                      index=df['ngram'])
    # Label the ngram axis "year"; it becomes the columns after the
    # transpose below, and the plotting code later relabels it "Word"
    df.index.name = "year"
    df = df.transpose()
    return df
df = get_ngrams("the Great War, World War I, World War II")
df.head()
year | the Great War | World War I | World War II
---|---|---|---
1800 | 8.974028e-09 | 8.142913e-08 | 1.477712e-07
1801 | 7.179222e-09 | 6.514331e-08 | 1.182170e-07
1802 | 5.982685e-09 | 5.428609e-08 | 9.902061e-08
1803 | 5.479016e-09 | 4.653093e-08 | 8.487481e-08
1804 | 4.345168e-09 | 3.009015e-08 | 5.482785e-08
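One practical caveat: this JSON endpoint is undocumented, and repeated identical requests are an easy way to get rate-limited. A minimal memoization wrapper (my addition; nothing below depends on it) avoids re-fetching:

from functools import lru_cache

@lru_cache(maxsize=128)
def get_ngrams_cached(query, start_year=1800, end_year=2019):
    # Identical (query, start_year, end_year) calls return the cached
    # DataFrame instead of hitting Google's servers again. Note that
    # callers share the cached object, so copy it before mutating.
    return get_ngrams(query, start_year, end_year)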
Step 3: Putting it Together¶
The class SemanticHistory takes as input a search term (or a list of terms whose vector embeddings are to be averaged) and generates a list of similar words (20 by default) along with their cosine similarities to the input, stored as self.most_similar. It feeds this list of words into the Ngram viewer and stores the resulting frequencies for each word as self.raw_data. The plot method displays a stacked area plot, weighting each word's frequencies by its similarity score unless weighted=False is passed.
import plotly.express as px

class SemanticHistory:
    def __init__(self, search=[], sample_n=20, start_year=1800, end_year=2019):
        # Normalize the input to a list of lowercase strings
        if isinstance(search, list):
            self.search = [str(s).lower() for s in search]
        else:
            self.search = [str(search).lower()]
        self.sample_n = sample_n
        if not any(" " in s for s in self.search):
            # The words closest to the (averaged) search vector(s), as
            # (word, cosine similarity) pairs
            self.most_similar = glove_vectors.most_similar(self.search, topn=sample_n)
            # Fetch Ngram frequencies for the search terms plus their neighbors
            words = self.search + [word[0] for word in self.most_similar]
            query = ', '.join(words)
            self.raw_data = get_ngrams(query, start_year, end_year)
        else:
            raise Exception("Input must be a single word or list of single words.")

    def plot(self, weighted=True):
        if weighted:
            # Scale each word's column by its similarity score; the search
            # terms themselves get a weight of 1 (this relies on columns
            # coming back from get_ngrams in query order)
            data = self.raw_data.multiply([1] * len(self.search)
                                          + [word[1] for word in self.most_similar])
        else:
            data = self.raw_data
        fig = px.area(data,
                      # the columns axis is named 'year' (see get_ngrams), so
                      # relabel it 'Word' and relabel the unnamed index 'Year'
                      labels={'value': ('Weighted ' * weighted + 'Frequency'),
                              'year': 'Word', 'index': 'Year'},
                      template='plotly_white')
        fig.update_layout(xaxis_title=None,
                          yaxis_showticklabels=False,
                          legend=dict(title=None, orientation="h"))
        fig.show()
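As a quick check that this matches the plan from earlier: self.most_similar is a list of (word, score) pairs, and weighting just scales each word's column by its score. A sketch, using "justice" as an arbitrary example term:

h = SemanticHistory("justice")
# Search terms get weight 1; each neighbor's column is scaled by its
# cosine similarity, exactly as in plot(weighted=True)
weights = [1] * len(h.search) + [score for _, score in h.most_similar]
weighted = h.raw_data.multiply(weights)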
Let's try exploring some semantic spaces!
gender_history = SemanticHistory(['gender', 'sex'])
honor_history = SemanticHistory(['honor', 'dignity'])
equality_history = SemanticHistory("equality")
eating_history = SemanticHistory("eating")
liberalarts_history = SemanticHistory(['literature', 'arts', 'poetry'])
darkness_history = SemanticHistory(['dark', 'gloomy'])
gender_history.plot(weighted = False)
honor_history.plot(weighted = False)
equality_history.plot(weighted = False)
eating_history.plot(weighted = False)
liberalarts_history.plot(weighted = False)
darkness_history.plot(weighted = False)
I'm delighted with this concept, but this is just the beginning. I'd like to turn this into a web app that anyone can use online. Before that happens, though, here are some features I plan on adding:
- Remove stopwords from the word list, so that words like "and" don't clutter the chart (sketched below).
- Paired comparisons (e.g. "male" as opposed to "female") - also sketched below.
- Support for 2-grams and 3-grams (i.e. multi-word phrases).
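For the curious, here's a rough sketch of how I imagine the first two features working (the stopword list here is a stand-in, not the one I'd actually ship):

# Stopword removal: filter the similarity list before it goes to the
# Ngram viewer. STOPWORDS is a stand-in for a proper list (e.g. NLTK's).
STOPWORDS = {'a', 'an', 'and', 'or', 'of', 'the', 'to', 'in'}
similar = glove_vectors.most_similar(['eating'], topn=30)
similar = [(w, s) for w, s in similar if w not in STOPWORDS][:20]

# Paired comparisons: gensim already supports vector arithmetic, so
# "male AS OPPOSED TO female" becomes a most_similar call with a
# negative term.
contrast = glove_vectors.most_similar(positive=['male'], negative=['female'], topn=10)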