Introduction¶
WeRateDogs is a Twitter account that rates user-submitted photos of dogs, usually with humorous commentary. It has a unique rating system comprising a numerator and a denominator, where the numerator is usually greater than the denominator. The main objective of this project is to gather and wrangle data about the WeRateDogs account from several different sources and then retrieve some insights from it.
import pandas as pd
import requests
import os
import glob
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tweepy
import configparser
from tweepy import OAuthHandler
from timeit import default_timer as timer
%matplotlib inline
%autosave 60
Autosaving every 60 seconds
Data Gathering¶
Download the WeRateDogs Twitter archive by clicking on the download link, then use the pd.read_csv method to load the data into a Pandas DataFrame.
archives_df = pd.read_csv('twitter-archive-enhanced.csv')
Use the Requests library to download the tweet image prediction file (image_predictions.tsv)
#Make a new folder to hold downloaded image-predictions tsv file
folder = 'Predictions'
if not os.path.exists(folder):
    os.makedirs(folder)
#Download file using the requests library
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
#write the contents of the response into a file in the Predictions folder
with open(os.path.join(folder, url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)
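As a defensive touch (a minimal sketch, not part of the original run), we could verify the HTTP response before writing it to disk; raise_for_status raises an HTTPError for any 4xx/5xx status code:
#sketch: download with an explicit status check and a timeout
response = requests.get(url, timeout=30)
response.raise_for_status()  #raises requests.exceptions.HTTPError on 4xx/5xx responses
with open(os.path.join(folder, url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)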
Confirm the contents of the Predictions folder:
#list all the files/folders in the specified directory
os.listdir('Predictions')
['image-predictions.tsv']
Now that we have confirmed that the image-predictions file has been successfully created, we can proceed to read its contents into a dataframe.
#Read Data from the Image Predictions file into a dataframe
predictions_df = pd.read_csv('Predictions/image-predictions.tsv', sep='\t')
The third dataset will be retrieved by querying Twitter's API.
#use configparser module to read API keys
credentials = configparser.ConfigParser()
credentials.read('config.ini')
['config.ini']
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = credentials['WeRateDogs']['APIKey']
consumer_secret = credentials['WeRateDogs']['API_Key_Secret']
access_token = credentials['WeRateDogs']['ACCESS_Token']
access_secret = credentials['WeRateDogs']['ACCESS_Token_Secret']
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
# archives_df is the DataFrame loaded from the twitter-archive-enhanced.csv file above.
# Adjust the variable name below if your DataFrame is named differently.
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = archives_df.tweet_id.values
len(tweet_ids)
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
            print(str(count) + ": " + str(tweet_id))
        except tweepy.errors.Forbidden as e:
            #Handle the expected HTTP 403 Forbidden error raised when the app lacks Elevated access
            print(e)
            print('\n')
            break
        except tweepy.errors.TweepyException as e:
            print("Fail")
            fails_dict[tweet_id] = e
end = timer()
print('{:.4f} secs'.format(end - start))
#Checks if dict is populated, if yes, prints contents
if fails_dict:
    print(fails_dict)
403 Forbidden 453 - You currently have Essential access which includes access to Twitter API v2 endpoints only. If you need access to this endpoint, you’ll need to apply for Elevated access via the Developer Portal. You can learn more here: https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api#v2-access-leve 0.7197 secs
Since we don't have Elevated access rights to Twitter's API, we are going to manually download additional data to supplement our datasets from this link.
Once you've downloaded the tweet-json text file, move it into your working directory.
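Alternatively, the file can be fetched programmatically with the Requests library. A minimal sketch, assuming a placeholder URL (substitute the actual download link from the project page):
#sketch: fetch the tweet-json text file with requests
#NOTE: tweet_json_url is a hypothetical placeholder; use the real download link
tweet_json_url = 'https://example.com/tweet-json.txt'
resp = requests.get(tweet_json_url, timeout=30)
resp.raise_for_status()
with open('tweet-json.txt', mode='wb') as f:
    f.write(resp.content)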
#Create an empty list that will be used to hold the dictionary objects read from the Json txt file
json_list = []
Read JSON data from the tweet-json.txt text file:
'''
Use the glob module to loop through and find text files in the working directory
Iterate over the file object to read the text file line by line
Parse each line (a JSON string) into a Python dictionary with the json.loads method
Build a dictionary of the fields of interest and load it into the json_list list
Confirm the dictionary is not already present in the list before appending, so we don't append duplicates
'''
for tweet_file in glob.glob('*.txt'):
    with open(tweet_file, encoding='utf-8') as file:
        for line in file:
            tweet_obj = json.loads(line)
            tweet_dict = {'tweet_id' : tweet_obj['id'], 'created_at' : tweet_obj['created_at'],
                          'retweet_count' : tweet_obj['retweet_count'], 'favorite_count' : tweet_obj['favorite_count'],
                          'followers_count' : tweet_obj['user']['followers_count']}
            if tweet_dict not in json_list:
                json_list.append(tweet_dict)
#glimpse of the first 2 items in the list
json_list[0:2]
[{'tweet_id': 892177421306343426, 'created_at': 'Tue Aug 01 00:17:27 +0000 2017', 'retweet_count': 6514, 'favorite_count': 33819, 'followers_count': 3200889}, {'tweet_id': 891689557279858688, 'created_at': 'Sun Jul 30 15:58:51 +0000 2017', 'retweet_count': 8964, 'favorite_count': 42908, 'followers_count': 3200889}]
Confirm that they are of the Python dict datatype:
#Type of the zero-indexed object in the json_list list
assert(type(json_list[0]) == dict)
Optionally, we can use the isinstance function, which returns True if the specified object is an instance of the indicated type or of a subclass thereof.
#Type of last-indexed object in the json_list
isinstance(json_list[-1], dict)
True
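As an illustrative aside (not part of the wrangling flow), bool is a subclass of int, so isinstance returns True where a strict type comparison would not:
#bool is a subclass of int, so isinstance(True, int) is True
assert isinstance(True, int)
#a strict type check, by contrast, distinguishes the two classes
assert type(True) != int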
Use Pandas' from_dict method to construct a dataframe from the dictionaries in the json_list list.
additional_tweets_df = pd.DataFrame.from_dict(json_list)
Check if the tweet_ids in the newly created additional_tweets_df are in the archives dataframe.
#use for loop to assert presence of subset
for tweetID in list(additional_tweets_df.tweet_id):
    assert tweetID in list(archives_df.tweet_id)
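An equivalent vectorized check (a sketch using pandas' isin method) avoids the Python-level loop:
#vectorized alternative: confirm every tweet_id in additional_tweets_df appears in archives_df
assert additional_tweets_df.tweet_id.isin(archives_df.tweet_id).all()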
Assessing Data¶
Quality issues¶
archives_df Table¶
- Duplicate tweets in the form of retweets
- tweet_id is an integer instead of a string object
- timestamp is a string object
- The in_reply_to_user_id, in_reply_to_status_id, retweeted_status_timestamp, retweeted_status_id and retweeted_status_user_id columns have null values
- The source of the tweet is embedded in HTML tags in the source column
- Missing names represented as None
- Names extracted incorrectly in some cases, e.g. 'a', 'the', 'an', among many others
- rating_denominator is greater than or less than 10 in some cases
additional_tweets_df Table:¶
- The created_at column in additional_tweets_df is a duplicate of the timestamp column in archives_df
- tweet_id is an integer instead of a string object

predictions_df Table¶
- tweet_id is an integer instead of a string object
Tidiness Issues¶
- The doggo, floofer, pupper and puppo identifiers should be unpivoted into a single column
- The three tables should be merged into a single unit.
#Expand the size of the output display
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
Let's make a visual inspection of our first dataframe.
archives_df Table¶
archives_df.sample(5)
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2260 | 667550882905632768 | NaN | NaN | 2015-11-20 03:51:47 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | RT @dogratingrating: Unoriginal idea. Blatant plagiarism. Curious grammar. -5/10 https://t.co/r7XzeQZWzb | 6.675484e+17 | 4.296832e+09 | 2015-11-20 03:41:59 +0000 | https://twitter.com/dogratingrating/status/667548415174144001/photo/1,https://twitter.com/dogratingrating/status/667548415174144001/photo/1 | 5 | 10 | None | None | None | None | None |
1613 | 685315239903100929 | NaN | NaN | 2016-01-08 04:21:00 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | I would like everyone to appreciate this pup's face as much as I do. 11/10 https://t.co/QIe7oxkSNo | NaN | NaN | NaN | https://twitter.com/dog_rates/status/685315239903100929/photo/1,https://twitter.com/dog_rates/status/685315239903100929/photo/1 | 11 | 10 | None | None | None | None | None |
2263 | 667544320556335104 | NaN | NaN | 2015-11-20 03:25:43 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | This is Kial. Kial is either wearing a cape, which would be rad, or flashing us, which would be rude. 10/10 or 4/10 https://t.co/8zcwIoiuqR | NaN | NaN | NaN | https://twitter.com/dog_rates/status/667544320556335104/photo/1 | 10 | 10 | Kial | None | None | None | None |
1347 | 704364645503647744 | NaN | NaN | 2016-02-29 17:56:32 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | Say hello to Bisquick. He's a Beneplop Cumbersnug. Even smiles when wet. 12/10 I'd steal Bisquick https://t.co/5zX5XD3i6K | NaN | NaN | NaN | https://twitter.com/dog_rates/status/704364645503647744/photo/1,https://twitter.com/dog_rates/status/704364645503647744/photo/1 | 12 | 10 | Bisquick | None | None | None | None |
973 | 750071704093859840 | NaN | NaN | 2016-07-04 21:00:04 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | Pause your cookout and admire this pupper's nifty hat. 10/10 https://t.co/RG4C9IdNJM | NaN | NaN | NaN | https://twitter.com/dog_rates/status/750071704093859840/photo/1,https://twitter.com/dog_rates/status/750071704093859840/photo/1,https://twitter.com/dog_rates/status/750071704093859840/photo/1 | 10 | 10 | None | None | None | pupper | None |
archives_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2356 non-null int64 1 in_reply_to_status_id 78 non-null float64 2 in_reply_to_user_id 78 non-null float64 3 timestamp 2356 non-null object 4 source 2356 non-null object 5 text 2356 non-null object 6 retweeted_status_id 181 non-null float64 7 retweeted_status_user_id 181 non-null float64 8 retweeted_status_timestamp 181 non-null object 9 expanded_urls 2297 non-null object 10 rating_numerator 2356 non-null int64 11 rating_denominator 2356 non-null int64 12 name 2356 non-null object 13 doggo 2356 non-null object 14 floofer 2356 non-null object 15 pupper 2356 non-null object 16 puppo 2356 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 313.0+ KB
The archives_df dataframe comprises 2356 rows and a total of 17 columns. On examining the column attributes, you can notice that there are 5 columns made up mostly of NULL values. These are:
- in_reply_to_status_id 78 non-null
- in_reply_to_user_id 78 non-null
- retweeted_status_id 181 non-null
- retweeted_status_user_id 181 non-null
- retweeted_status_timestamp 181 non-null
#dataframes' numbers of rows and columns
archives_df.shape
(2356, 17)
archives_df.describe()
tweet_id | in_reply_to_status_id | in_reply_to_user_id | retweeted_status_id | retweeted_status_user_id | rating_numerator | rating_denominator | |
---|---|---|---|---|---|---|---|
count | 2.356000e+03 | 7.800000e+01 | 7.800000e+01 | 1.810000e+02 | 1.810000e+02 | 2356.000000 | 2356.000000 |
mean | 7.427716e+17 | 7.455079e+17 | 2.014171e+16 | 7.720400e+17 | 1.241698e+16 | 13.126486 | 10.455433 |
std | 6.856705e+16 | 7.582492e+16 | 1.252797e+17 | 6.236928e+16 | 9.599254e+16 | 45.876648 | 6.745237 |
min | 6.660209e+17 | 6.658147e+17 | 1.185634e+07 | 6.661041e+17 | 7.832140e+05 | 0.000000 | 0.000000 |
25% | 6.783989e+17 | 6.757419e+17 | 3.086374e+08 | 7.186315e+17 | 4.196984e+09 | 10.000000 | 10.000000 |
50% | 7.196279e+17 | 7.038708e+17 | 4.196984e+09 | 7.804657e+17 | 4.196984e+09 | 11.000000 | 10.000000 |
75% | 7.993373e+17 | 8.257804e+17 | 4.196984e+09 | 8.203146e+17 | 4.196984e+09 | 12.000000 | 10.000000 |
max | 8.924206e+17 | 8.862664e+17 | 8.405479e+17 | 8.874740e+17 | 7.874618e+17 | 1776.000000 | 170.000000 |
Check for duplicates in the tweet_id column:
archives_df['tweet_id'].duplicated().sum()
0
There are no duplicates in the archives_df dataframe.
archives_df.duplicated().sum()
0
Let us examine the name column
#unique items in name column
archives_df.name.value_counts()
None 745 a 55 Charlie 12 Cooper 11 Lucy 11 Oliver 11 Tucker 10 Penny 10 Lola 10 Winston 9 an 7 Bailey 7 Leo 6 Oscar 6 Finn 5 George 5 Larry 5 Clarence 4 Loki 4 Walter 4 Name: name, dtype: int64
Sample some of the dogs named 'a' and confirm whether the names were extracted correctly
archives_df[archives_df.name == 'a'].sample(3)
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2211 | 668614819948453888 | NaN | NaN | 2015-11-23 02:19:29 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | Here is a horned dog. Much grace. Can jump over moons (dam!). Paws not soft. Bad at barking. 7/10 can still pet tho https://t.co/2Su7gmsnZm | NaN | NaN | NaN | https://twitter.com/dog_rates/status/668614819948453888/photo/1 | 7 | 10 | a | None | None | None | None |
2327 | 666407126856765440 | NaN | NaN | 2015-11-17 00:06:54 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is a southern Vesuvius bumblegruff. Can drive a truck (wow). Made friends with 5 other nifty dogs (neat). 7/10 https://t.co/LopTBkKa8h | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666407126856765440/photo/1 | 7 | 10 | a | None | None | None | None |
1368 | 702539513671897089 | NaN | NaN | 2016-02-24 17:04:07 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is a Wild Tuscan Poofwiggle. Careful not to startle. Rare tongue slip. One eye magical. 12/10 would def pet https://t.co/4EnShAQjv6 | NaN | NaN | NaN | https://twitter.com/dog_rates/status/702539513671897089/photo/1,https://twitter.com/dog_rates/status/702539513671897089/photo/1,https://twitter.com/dog_rates/status/702539513671897089/photo/1 | 12 | 10 | a | None | None | None | None |
Sample some of the dogs named 'an' and confirm whether the names were extracted correctly
archives_df[archives_df.name == 'an'].sample(3)
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2204 | 668636665813057536 | NaN | NaN | 2015-11-23 03:46:18 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is an Irish Rigatoni terrier named Berta. Completely made of rope. No eyes. Quite large. Loves to dance. 10/10 https://t.co/EM5fDykrJg | NaN | NaN | NaN | https://twitter.com/dog_rates/status/668636665813057536/photo/1 | 10 | 10 | an | None | None | None | None |
759 | 778396591732486144 | NaN | NaN | 2016-09-21 00:53:04 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | RT @dog_rates: This is an East African Chalupa Seal. We only rate dogs. Please only send in dogs. Thank you... 10/10 https://t.co/iHe6liLwWR | 7.030419e+17 | 4.196984e+09 | 2016-02-26 02:20:37 +0000 | https://twitter.com/dog_rates/status/703041949650034688/photo/1,https://twitter.com/dog_rates/status/703041949650034688/photo/1 | 10 | 10 | an | None | None | None | None |
2333 | 666337882303524864 | NaN | NaN | 2015-11-16 19:31:45 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is an extremely rare horned Parthenon. Not amused. Wears shoes. Overall very nice. 9/10 would pet aggressively https://t.co/QpRjllzWAL | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666337882303524864/photo/1 | 9 | 10 | an | None | None | None | None |
Let's count the number of null values in the in_reply_to_status_id column
sum(archives_df.in_reply_to_status_id.isnull())
2278
Number of null values in the in_reply_to_user_id
sum(archives_df.in_reply_to_user_id.isnull())
2278
Number of null values in the retweeted_status_id
sum(archives_df.retweeted_status_id.isnull())
2175
Number of null values in the retweeted_status_user_id
sum(archives_df.retweeted_status_user_id.isnull())
2175
No. of unique values in the rating_denominator column with their respective counts
archives_df.rating_denominator.value_counts()
10 2333 11 3 50 3 20 2 80 2 70 1 7 1 15 1 150 1 170 1 0 1 90 1 40 1 130 1 110 1 16 1 120 1 2 1 Name: rating_denominator, dtype: int64
There are several entries where the rating_denominator is not equal to 10.
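One way to cross-check these denominators (a sketch, assuming every rating appears in the tweet text as 'numerator/denominator') is to extract the rating from the text with a regular expression:
#sketch: extract 'num/denom' patterns from the tweet text for cross-checking
extracted = archives_df.text.str.extract(r'(\d+(?:\.\d+)?)/(\d+)')
extracted.columns = ['text_numerator', 'text_denominator']
extracted.head()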
Predictions Table¶
#retrieve a sample of 10 rows from dataframe
predictions_df.sample(10)
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
22 | 666337882303524864 | https://pbs.twimg.com/media/CT9OwFIWEAMuRje.jpg | 1 | ox | 0.416669 | False | Newfoundland | 0.278407 | True | groenendael | 0.102643 | True |
1653 | 809920764300447744 | https://pbs.twimg.com/media/Cz1qo05XUAQ4qXp.jpg | 1 | Norwich_terrier | 0.397163 | True | toy_poodle | 0.274540 | True | miniature_poodle | 0.134667 | True |
111 | 667902449697558528 | https://pbs.twimg.com/media/CUTdvAJXIAAMS4q.jpg | 1 | Norwegian_elkhound | 0.298881 | True | malamute | 0.279479 | True | Eskimo_dog | 0.198428 | True |
309 | 671538301157904385 | https://pbs.twimg.com/media/CVHIhi2WsAEgdKk.jpg | 1 | park_bench | 0.194211 | False | water_bottle | 0.071870 | False | beacon | 0.053433 | False |
349 | 672482722825261057 | https://pbs.twimg.com/media/CVUjd14W4AE8tvO.jpg | 1 | West_Highland_white_terrier | 0.586173 | True | borzoi | 0.206620 | True | Great_Pyrenees | 0.060653 | True |
32 | 666428276349472768 | https://pbs.twimg.com/media/CT-g-0DUwAEQdSn.jpg | 1 | Pembroke | 0.371361 | True | chow | 0.249394 | True | Pomeranian | 0.241878 | True |
242 | 670449342516494336 | https://pbs.twimg.com/media/CU3qHNTWsAApGr0.jpg | 1 | peacock | 0.999924 | False | European_gallinule | 0.000030 | False | agama | 0.000022 | False |
854 | 696488710901260288 | https://pbs.twimg.com/media/CapsyfkWcAQ41uC.jpg | 1 | briard | 0.369063 | True | Scotch_terrier | 0.168204 | True | giant_schnauzer | 0.120553 | True |
173 | 669006782128353280 | https://pbs.twimg.com/media/CUjKHs0WIAECWP3.jpg | 1 | Chihuahua | 0.127178 | True | Italian_greyhound | 0.054215 | True | pillow | 0.048592 | False |
1068 | 715928423106027520 | https://pbs.twimg.com/media/Ce99GhLW8AAHG38.jpg | 1 | pug | 0.976685 | True | French_bulldog | 0.019663 | True | bull_mastiff | 0.002278 | True |
predictions_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2075 non-null int64 1 jpg_url 2075 non-null object 2 img_num 2075 non-null int64 3 p1 2075 non-null object 4 p1_conf 2075 non-null float64 5 p1_dog 2075 non-null bool 6 p2 2075 non-null object 7 p2_conf 2075 non-null float64 8 p2_dog 2075 non-null bool 9 p3 2075 non-null object 10 p3_conf 2075 non-null float64 11 p3_dog 2075 non-null bool dtypes: bool(3), float64(3), int64(2), object(4) memory usage: 152.1+ KB
From the predictions_df table attributes, we can gather that the dataframe consists of 12 columns and 2,075 non-null rows.
#retrieve descriptive statistics
predictions_df.describe()
tweet_id | img_num | p1_conf | p2_conf | p3_conf | |
---|---|---|---|---|---|
count | 2.075000e+03 | 2075.000000 | 2075.000000 | 2.075000e+03 | 2.075000e+03 |
mean | 7.384514e+17 | 1.203855 | 0.594548 | 1.345886e-01 | 6.032417e-02 |
std | 6.785203e+16 | 0.561875 | 0.271174 | 1.006657e-01 | 5.090593e-02 |
min | 6.660209e+17 | 1.000000 | 0.044333 | 1.011300e-08 | 1.740170e-10 |
25% | 6.764835e+17 | 1.000000 | 0.364412 | 5.388625e-02 | 1.622240e-02 |
50% | 7.119988e+17 | 1.000000 | 0.588230 | 1.181810e-01 | 4.944380e-02 |
75% | 7.932034e+17 | 1.000000 | 0.843855 | 1.955655e-01 | 9.180755e-02 |
max | 8.924206e+17 | 4.000000 | 1.000000 | 4.880140e-01 | 2.734190e-01 |
#Check for duplicates
predictions_df.tweet_id.duplicated().sum()
0
There are no duplicates in the predictions dataframe.
additional_tweets_df¶
additional_tweets_df.sample(10)
tweet_id | created_at | retweet_count | favorite_count | followers_count | |
---|---|---|---|---|---|
1124 | 667806454573760512 | Fri Nov 20 20:47:20 +0000 2015 | 535 | 1111 | 3201016 |
1140 | 667200525029539841 | Thu Nov 19 04:39:35 +0000 2015 | 282 | 658 | 3201017 |
58 | 869988702071779329 | Wed May 31 18:47:24 +0000 2017 | 25661 | 0 | 3200891 |
106 | 851464819735769094 | Mon Apr 10 16:00:07 +0000 2017 | 7855 | 25944 | 3200891 |
496 | 748346686624440324 | Thu Jun 30 02:45:28 +0000 2016 | 1413 | 5735 | 3200943 |
132 | 841833993020538882 | Wed Mar 15 02:10:39 +0000 2017 | 17504 | 0 | 3200891 |
881 | 678410210315247616 | Sun Dec 20 03:02:53 +0000 2015 | 2072 | 4640 | 3200954 |
404 | 771380798096281600 | Thu Sep 01 16:14:48 +0000 2016 | 5912 | 11746 | 3200905 |
936 | 675113801096802304 | Fri Dec 11 00:44:07 +0000 2015 | 877 | 2120 | 3201003 |
759 | 690690673629138944 | Sat Jan 23 00:21:03 +0000 2016 | 898 | 2547 | 3200951 |
additional_tweets_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1177 entries, 0 to 1176 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 1177 non-null int64 1 created_at 1177 non-null object 2 retweet_count 1177 non-null int64 3 favorite_count 1177 non-null int64 4 followers_count 1177 non-null int64 dtypes: int64(4), object(1) memory usage: 46.1+ KB
There are no NULL values in the additional_tweets_df table; all rows are populated with non-null values.
#retrieve descriptive stats
additional_tweets_df.describe()
tweet_id | retweet_count | favorite_count | followers_count | |
---|---|---|---|---|
count | 1.177000e+03 | 1177.000000 | 1177.000000 | 1.177000e+03 |
mean | 7.426493e+17 | 3107.338997 | 7901.181818 | 3.200942e+06 |
std | 6.851760e+16 | 5458.476288 | 11536.383916 | 4.432643e+01 |
min | 6.660209e+17 | 2.000000 | 0.000000 | 3.200802e+06 |
25% | 6.783968e+17 | 613.000000 | 1339.000000 | 3.200898e+06 |
50% | 7.193678e+17 | 1460.000000 | 3553.000000 | 3.200945e+06 |
75% | 7.992971e+17 | 3628.000000 | 9872.000000 | 3.200953e+06 |
max | 8.921774e+17 | 79515.000000 | 131075.000000 | 3.201018e+06 |
The min and max rows show that the retweet_count, favorite_count and followers_count columns contain whole numbers across their entire ranges.
Let's check for duplicate entries, and then for column names shared across our 3 dataframes.
#check for duplicates
additional_tweets_df.duplicated().sum()
0
There are no duplicated entries in the additional_tweets_df table
#Check for common column names between the 3 tables
column_series = pd.Series(list(archives_df) + list(additional_tweets_df) + list(predictions_df))
column_series[column_series.duplicated()]
17 tweet_id 22 tweet_id dtype: object
The tweet_id column will come in handy later when we perform join operations on the tables.
Cleaning Data¶
Make copies of original pieces of data¶
archives_clean = archives_df.copy()
predictions_clean = predictions_df.copy()
additional_clean = additional_tweets_df.copy()
Quality issues¶
Issue #1: archives_df - duplicate tweets in the form of retweets¶
Define:¶
- Delete retweets, identified as rows that have a value in the retweeted_status_id column
Code¶
#filter rows with values in retweeted_status_id column
archives_clean[archives_clean['retweeted_status_id'].notnull()]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
19 | 888202515573088257 | NaN | NaN | 2017-07-21 01:02:36 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | RT @dog_rates: This is Canela. She attempted some fancy porch pics. They were unsuccessful. 13/10 someone help her https://t.co/cLyzpcUcMX | 8.874740e+17 | 4.196984e+09 | 2017-07-19 00:47:34 +0000 | https://twitter.com/dog_rates/status/887473957103951883/photo/1,https://twitter.com/dog_rates/status/887473957103951883/photo/1,https://twitter.com/dog_rates/status/887473957103951883/photo/1,https://twitter.com/dog_rates/status/887473957103951883/photo/1 | 13 | 10 | Canela | None | None | None | None |
32 | 886054160059072513 | NaN | NaN | 2017-07-15 02:45:48 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | RT @Athletics: 12/10 #BATP https://t.co/WxwJmvjfxo | 8.860537e+17 | 1.960740e+07 | 2017-07-15 02:44:07 +0000 | https://twitter.com/dog_rates/status/886053434075471873,https://twitter.com/dog_rates/status/886053434075471873 | 12 | 10 | None | None | None | None | None |
36 | 885311592912609280 | NaN | NaN | 2017-07-13 01:35:06 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | RT @dog_rates: This is Lilly. She just parallel barked. Kindly requests a reward now. 13/10 would pet so well https://t.co/SATN4If5H5 | 8.305833e+17 | 4.196984e+09 | 2017-02-12 01:04:29 +0000 | https://twitter.com/dog_rates/status/830583320585068544/photo/1,https://twitter.com/dog_rates/status/830583320585068544/photo/1,https://twitter.com/dog_rates/status/830583320585068544/photo/1,https://twitter.com/dog_rates/status/830583320585068544/photo/1 | 13 | 10 | Lilly | None | None | None | None |
#create a new df that excludes the filtered rows
archives_clean = archives_clean[archives_clean['retweeted_status_id'].isnull()]
Test¶
#ensure that the count of rows with retweeted_status_id(retweets) is zero
assert(archives_clean['retweeted_status_id'].notnull().sum() == 0)
Issue #2: tweet_id is an integer instead of a string object in all 3 dataframes¶
Define¶
- Change the datatype of tweet_id from an integer to a String
Code¶
#use astype method to convert to desired datatype
archives_clean['tweet_id'] = archives_clean['tweet_id'].astype('str')
additional_clean['tweet_id'] = additional_clean['tweet_id'].astype('str')
predictions_clean['tweet_id'] = predictions_clean['tweet_id'].astype('str')
Test:¶
#confirm that each tweet_id is of class object
assert(archives_clean['tweet_id'].dtype == 'O')
assert(additional_clean['tweet_id'].dtype == 'O')
assert(predictions_clean['tweet_id'].dtype == 'O')
Issue #3: archives_df - timestamp is a string object¶
#Retrieve a sample of the timestamp column
archives_clean.timestamp.sample(1)
1903 2015-12-09 17:15:54 +0000 Name: timestamp, dtype: object
Define¶
- Convert string to a datetime object
Code¶
# use to_datetime function to convert str to a pandas datetime object
archives_clean['timestamp'] = pd.to_datetime(archives_clean.timestamp)
Test¶
#confirm that the resulting column is of datetime - UTC timezone datatype
assert(archives_clean['timestamp'].dtype == 'datetime64[ns, UTC]')
Issue #4: archives_df - in_reply_to_user_id, in_reply_to_status_id, retweeted_status_timestamp, retweeted_status_id and retweeted_status_user_id columns have null values¶
Define¶
- Delete indicated columns from dataframe
Code¶
archives_clean.sample(3)
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
143 | 864197398364647424 | NaN | NaN | 2017-05-15 19:14:50+00:00 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | This is Paisley. She ate a flower just to prove she could. Savage af. 13/10 would pet so well https://t.co/cPq9fYvkzr | NaN | NaN | NaN | https://twitter.com/dog_rates/status/864197398364647424/photo/1,https://twitter.com/dog_rates/status/864197398364647424/photo/1,https://twitter.com/dog_rates/status/864197398364647424/photo/1,https://twitter.com/dog_rates/status/864197398364647424/photo/1 | 13 | 10 | Paisley | None | None | None | None |
1274 | 709198395643068416 | NaN | NaN | 2016-03-14 02:04:08+00:00 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK | NaN | NaN | NaN | https://twitter.com/dog_rates/status/709198395643068416/photo/1 | 45 | 50 | None | None | None | None | None |
1321 | 706310011488698368 | NaN | NaN | 2016-03-06 02:46:44+00:00 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | Here's a very sleepy pupper. Thinks it's an airplane. 12/10 would snug for eternity https://t.co/GGmcTIkBbf | NaN | NaN | NaN | https://twitter.com/dog_rates/status/706310011488698368/photo/1,https://twitter.com/dog_rates/status/706310011488698368/photo/1 | 12 | 10 | None | None | None | pupper | None |
#List of columns to be dropped
to_be_deleted = ['in_reply_to_user_id', 'in_reply_to_status_id',
'retweeted_status_timestamp', 'retweeted_status_id',
'retweeted_status_user_id']
archives_clean = archives_clean.drop(columns=to_be_deleted)
Test¶
#checks if any of the columns in the new df are in the to_be_deleted list
#Returns 'Specified columns were all deleted' if the specified columns were deleted
def check_columns():
    for column in archives_clean.columns:
        if column in to_be_deleted:
            return ('{} was not deleted'.format(column))
    return('Specified columns were all deleted')
check_columns()
'Specified columns were all deleted'
Issue #5 archives_df: The source of the tweet is embedded in html tags in the source column¶
Define¶
- Retrieve the source of the tweet from the string object
Code¶
'''
Split the string on '>' and use indexing to pick the one-indexed object
Split the resulting object on whitespace and pick the last item
Strip whitespace from the string
Finally, strip the closing anchor tag from the str
'''
archives_clean['source'] = archives_clean['source'].apply(lambda x: x.split('>')[1].split()[-1].strip().strip('</a'))
Test¶
#Count the occurrences of each unique item
archives_clean['source'].value_counts()
iPhone 2042 Scene 91 Client 31 TweetDeck 11 Name: source, dtype: int64
#Check for unique items in source column
archives_clean['source'].unique()
array(['iPhone', 'Client', 'Scene', 'TweetDeck'], dtype=object)
We've successfully stripped away the HTML tags and are left with clean tweet sources: 'iPhone', 'Client', 'Scene', 'TweetDeck'.
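For reference, an equivalent one-liner (a sketch using pandas' str.extract on the original, uncleaned column) pulls the anchor text out with a regular expression; note that it keeps the full display name (e.g. 'Twitter for iPhone') rather than just the last word:
#sketch: capture the text between the anchor tags, e.g. '<a ...>Twitter Web Client</a>' -> 'Twitter Web Client'
archives_df['source'].str.extract(r'>([^<]+)<', expand=False).value_counts()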
Issue #6 archives_df: missing name represented as None¶
Define¶
- Replace None string entries in the name column with a meaningful descriptor (No_Name)
Code¶
# replace every instance of the word None with 'No_Name'
archives_clean['name'] = archives_clean['name'].replace('None', 'No_Name')
Test¶
#confirm that the string 'None' is no longer in the name column
assert 'None' not in archives_clean['name'].unique()
Issue #7 archives_df: some names were extracted incorrectly¶
Define¶
- Replace incorrectly extracted names with 'incorrect_name'
#Count the total number of unique names in the archives_clean dataframe:
len(list(archives_clean.name.unique()))
956
Visualize unique items in the name column and try to decipher if there is a pattern in the incorrectly extracted names
archives_clean.name.unique()
array(['Phineas', 'Tilly', 'Archie', 'Darla', 'Franklin', 'No_Name', 'Jax', 'Zoey', 'Cassie', 'Koda', 'Bruno', 'Ted', 'Stuart', 'Oliver', 'Jim', 'Zeke', 'Ralphus', 'Gerald', 'Jeffrey', 'such', 'Canela', 'Maya', 'Mingus', 'Derek', 'Roscoe', 'Waffles', 'Jimbo', 'Maisey', 'Earl', 'Lola', 'Kevin', 'Yogi', 'Noah', 'Bella', 'Grizzwald', 'Rusty', 'Gus', 'Stanley', 'Alfy', 'Koko', 'Rey', 'Gary', 'a', 'Elliot', 'Louis', 'Jesse', 'Romeo', 'Bailey', 'Duddles', 'Jack', 'Steven', 'Beau', 'Snoopy', 'Shadow', 'Emmy', 'Aja', 'Penny', 'Dante', 'Nelly', 'Ginger', 'Benedict', 'Venti', 'Goose', 'Nugget', 'Cash', 'Jed', 'Sebastian', 'Sierra', 'Monkey', 'Harry', 'Kody', 'Lassie', 'Rover', 'Napolean', 'Boomer', 'Cody', 'Rumble', 'Clifford', 'Dewey', 'Scout', 'Gizmo', 'Walter', 'Cooper', 'Harold', 'Shikha', 'Lili', 'Jamesy', 'Coco', 'Sammy', 'Meatball', 'Paisley', 'Albus', 'Neptune', 'Belle', 'Quinn', 'Zooey', 'Dave', 'Jersey', 'Hobbes', 'Burt', 'Lorenzo', 'Carl', 'Jordy', 'Milky', 'Trooper', 'quite', 'Sophie', 'Wyatt', 'Rosie', 'Thor', 'Oscar', 'Callie', 'Cermet', 'Marlee', 'Arya', 'Einstein', 'Alice', 'Rumpole', 'Benny', 'Aspen', 'Jarod', 'Wiggles', 'General', 'Sailor', 'Iggy', 'Snoop', 'Kyle', 'Leo', 'Riley', 'Noosh', 'Odin', 'Jerry', 'Georgie', 'Rontu', 'Cannon', 'Furzey', 'Daisy', 'Tuck', 'Barney', 'Vixen', 'Jarvis', 'Mimosa', 'Pickles', 'Brady', 'Luna', 'Charlie', 'Margo', 'Sadie', 'Hank', 'Tycho', 'Indie', 'Winnie', 'George', 'Bentley', 'Max', 'Dawn', 'Maddie', 'Monty', 'Sojourner', 'Winston', 'Odie', 'Arlo', 'Vincent', 'Lucy', 'Clark', 'Mookie', 'Meera', 'Ava', 'Eli', 'Ash', 'Tucker', 'Tobi', 'Chester', 'Wilson', 'Sunshine', 'Lipton', 'Bronte', 'Poppy', 'Gidget', 'Rhino', 'Willow', 'not', 'Orion', 'Eevee', 'Smiley', 'Miguel', 'Emanuel', 'Kuyu', 'Dutch', 'Pete', 'Scooter', 'Reggie', 'Lilly', 'Samson', 'Mia', 'Astrid', 'Malcolm', 'Dexter', 'Alfie', 'Fiona', 'one', 'Mutt', 'Bear', 'Doobert', 'Beebop', 'Alexander', 'Sailer', 'Brutus', 'Kona', 'Boots', 'Ralphie', 'Loki', 'Cupid', 'Pawnd', 'Pilot', 'Ike', 'Mo', 'Toby', 'Sweet', 'Pablo', 'Nala', 'Crawford', 'Gabe', 'Jimison', 'Duchess', 'Harlso', 'Sundance', 'Luca', 'Flash', 'Sunny', 'Howie', 'Jazzy', 'Anna', 'Finn', 'Bo', 'Wafer', 'Tom', 'Florence', 'Autumn', 'Buddy', 'Dido', 'Eugene', 'Ken', 'Strudel', 'Tebow', 'Chloe', 'Timber', 'Binky', 'Moose', 'Dudley', 'Comet', 'Akumi', 'Titan', 'Olivia', 'Alf', 'Oshie', 'Chubbs', 'Sky', 'Atlas', 'Eleanor', 'Layla', 'Rocky', 'Baron', 'Tyr', 'Bauer', 'Swagger', 'Brandi', 'Mary', 'Moe', 'Halo', 'Augie', 'Craig', 'Sam', 'Hunter', 'Pavlov', 'Phil', 'Kyro', 'Wallace', 'Ito', 'Seamus', 'Ollie', 'Stephan', 'Lennon', 'incredibly', 'Major', 'Duke', 'Sansa', 'Shooter', 'Django', 'Diogi', 'Sonny', 'Marley', 'Severus', 'Ronnie', 'Milo', 'Bones', 'Mauve', 'Chef', 'Doc', 'Peaches', 'Sobe', 'Longfellow', 'Mister', 'Iroh', 'Pancake', 'Snicku', 'Ruby', 'Brody', 'Mack', 'Nimbus', 'Laika', 'Maximus', 'Dobby', 'Moreton', 'Juno', 'Maude', 'Lily', 'Newt', 'Benji', 'Nida', 'Robin', 'Monster', 'BeBe', 'Remus', 'Levi', 'Mabel', 'Misty', 'Betty', 'Mosby', 'Maggie', 'Bruce', 'Happy', 'Ralphy', 'Brownie', 'Rizzy', 'Stella', 'Butter', 'Frank', 'Tonks', 'Lincoln', 'Rory', 'Logan', 'Dale', 'Rizzo', 'Arnie', 'Mattie', 'Pinot', 'Dallas', 'Hero', 'Frankie', 'Stormy', 'Reginald', 'Balto', 'Mairi', 'Loomis', 'Godi', 'Cali', 'Deacon', 'Timmy', 'Sampson', 'Chipson', 'Combo', 'Oakley', 'Dash', 'Hercules', 'Jay', 'Mya', 'Strider', 'Wesley', 'Solomon', 'Huck', 'O', 'Blue', 'Anakin', 'Finley', 'Sprinkles', 'Heinrich', 'Shakespeare', 'Chelsea', 'Bungalo', 'Chip', 
'Grey', 'Roosevelt', 'Willem', 'Davey', 'Dakota', 'Fizz', 'Dixie', 'very', 'Al', 'Jackson', 'Carbon', 'Klein', 'DonDon', 'Kirby', 'Lou', 'Chevy', 'Tito', 'Philbert', 'Louie', 'Rupert', 'Rufus', 'Brudge', 'Shadoe', 'Angel', 'Brat', 'Tove', 'my', 'Gromit', 'Aubie', 'Kota', 'Leela', 'Glenn', 'Shelby', 'Sephie', 'Bonaparte', 'Albert', 'Wishes', 'Rose', 'Theo', 'Rocco', 'Fido', 'Emma', 'Spencer', 'Lilli', 'Boston', 'Brandonald', 'Corey', 'Leonard', 'Beckham', 'Devón', 'Gert', 'Watson', 'Keith', 'Dex', 'Ace', 'Tayzie', 'Grizzie', 'Fred', 'Gilbert', 'Meyer', 'Zoe', 'Stewie', 'Calvin', 'Lilah', 'Spanky', 'Jameson', 'Piper', 'Atticus', 'Blu', 'Dietrich', 'Divine', 'Tripp', 'his', 'Cora', 'Huxley', 'Keurig', 'Bookstore', 'Linus', 'Abby', 'Shiloh', 'an', 'Gustav', 'Arlen', 'Percy', 'Lenox', 'Sugar', 'Harvey', 'Blanket', 'actually', 'Geno', 'Stark', 'Beya', 'Kilo', 'Kayla', 'Maxaroni', 'Bell', 'Doug', 'Edmund', 'Aqua', 'Theodore', 'just', 'Baloo', 'Chase', 'getting', 'Nollie', 'Rorie', 'Simba', 'Charles', 'Bayley', 'Axel', 'Storkson', 'Remy', 'Chadrick', 'mad', 'Kellogg', 'Buckley', 'Livvie', 'Terry', 'Hermione', 'Ralpher', 'Aldrick', 'Larry', 'this', 'unacceptable', 'Rooney', 'Crystal', 'Ziva', 'Stefan', 'Pupcasso', 'Puff', 'Flurpson', 'Coleman', 'Enchilada', 'Raymond', 'all', 'Rueben', 'Cilantro', 'Karll', 'Sprout', 'Blitz', 'Bloop', 'Colby', 'Lillie', 'Ashleigh', 'Kreggory', 'Sarge', 'Luther', 'Ivar', 'Jangle', 'Schnitzel', 'Panda', 'Berkeley', 'Ralphé', 'Charleson', 'Clyde', 'Harnold', 'Sid', 'Pippa', 'Otis', 'Carper', 'Bowie', 'Alexanderson', 'Suki', 'Barclay', 'Skittle', 'Ebby', 'Flávio', 'Smokey', 'Link', 'Jennifur', 'Ozzy', 'Bluebert', 'Stephanus', 'Bubbles', 'old', 'Zeus', 'Bertson', 'Nico', 'Michelangelope', 'Siba', 'Calbert', 'Curtis', 'Travis', 'Thumas', 'Kanu', 'Lance', 'Opie', 'Stubert', 'Kane', 'Olive', 'Chuckles', 'Staniel', 'Sora', 'Beemo', 'Gunner', 'infuriating', 'Lacy', 'Tater', 'Olaf', 'Cecil', 'Vince', 'Karma', 'Billy', 'Walker', 'Rodney', 'Klevin', 'Malikai', 'Bobble', 'River', 'Jebberson', 'Remington', 'Farfle', 'Jiminus', 'Harper', 'Clarkus', 'Finnegus', 'Cupcake', 'Kathmandu', 'Ellie', 'Katie', 'Kara', 'Adele', 'Zara', 'Ambrose', 'Jimothy', 'Bode', 'Terrenth', 'Reese', 'Chesterson', 'Lucia', 'Bisquick', 'Ralphson', 'Socks', 'Rambo', 'Rudy', 'Fiji', 'Rilo', 'Bilbo', 'Coopson', 'Yoda', 'Millie', 'Chet', 'Crouton', 'Daniel', 'Kaia', 'Murphy', 'Dotsy', 'Eazy', 'Coops', 'Fillup', 'Miley', 'Charl', 'Reagan', 'Yukon', 'CeCe', 'Cuddles', 'Claude', 'Jessiga', 'Carter', 'Ole', 'Pherb', 'Blipson', 'Reptar', 'Trevith', 'Berb', 'Bob', 'Colin', 'Brian', 'Oliviér', 'Grady', 'Kobe', 'Freddery', 'Bodie', 'Dunkin', 'Wally', 'Tupawc', 'Amber', 'Herschel', 'Edgar', 'Teddy', 'Kingsley', 'Brockly', 'Richie', 'Molly', 'Vinscent', 'Cedrick', 'Hazel', 'Lolo', 'Eriq', 'Phred', 'the', 'Oddie', 'Maxwell', 'Geoff', 'Covach', 'Durg', 'Fynn', 'Ricky', 'Herald', 'Lucky', 'Ferg', 'Trip', 'Clarence', 'Hamrick', 'Brad', 'Pubert', 'Frönq', 'Derby', 'Lizzie', 'Ember', 'Blakely', 'Opal', 'Marq', 'Kramer', 'Barry', 'Tyrone', 'Gordon', 'Baxter', 'Mona', 'Horace', 'Crimson', 'Birf', 'Hammond', 'Lorelei', 'Marty', 'Brooks', 'Petrick', 'Hubertson', 'Gerbald', 'Oreo', 'Bruiser', 'Perry', 'Bobby', 'Jeph', 'Obi', 'Tino', 'Kulet', 'Sweets', 'Lupe', 'Tiger', 'Jiminy', 'Griffin', 'Banjo', 'Brandy', 'Lulu', 'Darrel', 'Taco', 'Joey', 'Patrick', 'Kreg', 'Todo', 'Tess', 'Ulysses', 'Toffee', 'Apollo', 'Carly', 'Asher', 'Glacier', 'Chuck', 'Champ', 'Ozzie', 'Griswold', 'Cheesy', 'Moofasa', 'Hector', 'Goliath', 'Kawhi', 'by', 
'Emmie', 'Penelope', 'Willie', 'Rinna', 'Mike', 'William', 'Dwight', 'Evy', 'Hurley', 'Rubio', 'officially', 'Chompsky', 'Rascal', 'Linda', 'Tug', 'Tango', 'Grizz', 'Jerome', 'Crumpet', 'Jessifer', 'Izzy', 'Ralph', 'Sandy', 'Humphrey', 'Tassy', 'Juckson', 'Chuq', 'Tyrus', 'Karl', 'Godzilla', 'Vinnie', 'Kenneth', 'Herm', 'Bert', 'Striker', 'Donny', 'Pepper', 'Bernie', 'Buddah', 'Lenny', 'Arnold', 'Zuzu', 'Mollie', 'Laela', 'Tedders', 'Superpup', 'Rufio', 'Jeb', 'Rodman', 'Jonah', 'Chesney', 'life', 'Kenny', 'Henry', 'Bobbay', 'Mitch', 'Kaiya', 'Acro', 'Aiden', 'Obie', 'Dot', 'Shnuggles', 'Kendall', 'Jeffri', 'Steve', 'Eve', 'Mac', 'Fletcher', 'Kenzie', 'Pumpkin', 'Schnozz', 'Gustaf', 'Cheryl', 'Ed', 'Leonidas', 'Norman', 'Caryl', 'Scott', 'Taz', 'Darby', 'Jackie', 'light', 'Jazz', 'Franq', 'Pippin', 'Rolf', 'Snickers', 'Ridley', 'Cal', 'Bradley', 'Bubba', 'Tuco', 'Patch', 'Mojo', 'Batdog', 'Dylan', 'space', 'Mark', 'JD', 'Alejandro', 'Scruffers', 'Pip', 'Julius', 'Tanner', 'Sparky', 'Anthony', 'Holly', 'Jett', 'Amy', 'Sage', 'Andy', 'Mason', 'Trigger', 'Antony', 'Creg', 'Traviss', 'Gin', 'Jeffrie', 'Danny', 'Ester', 'Pluto', 'Bloo', 'Edd', 'Paull', 'Willy', 'Herb', 'Damon', 'Peanut', 'Nigel', 'Butters', 'Sandra', 'Fabio', 'Randall', 'Liam', 'Tommy', 'Ben', 'Raphael', 'Julio', 'Andru', 'Kloey', 'Shawwn', 'Skye', 'Kollin', 'Ronduh', 'Billl', 'Saydee', 'Dug', 'Tessa', 'Sully', 'Kirk', 'Ralf', 'Clarq', 'Jaspers', 'Samsom', 'Terrance', 'Harrison', 'Chaz', 'Jeremy', 'Jaycob', 'Lambeau', 'Ruffles', 'Amélie', 'Bobb', 'Banditt', 'Kevon', 'Winifred', 'Hanz', 'Churlie', 'Zeek', 'Timofy', 'Maks', 'Jomathan', 'Kallie', 'Marvin', 'Spark', 'Gòrdón', 'Jo', 'DayZ', 'Jareld', 'Torque', 'Ron', 'Skittles', 'Cleopatricia', 'Erik', 'Stu', 'Tedrick', 'Shaggy', 'Filup', 'Kial', 'Naphaniel', 'Dook', 'Hall', 'Philippe', 'Biden', 'Fwed', 'Genevieve', 'Joshwa', 'Timison', 'Bradlay', 'Pipsy', 'Clybe', 'Keet', 'Carll', 'Jockson', 'Josep', 'Lugan', 'Christoper'], dtype=object)
A careful walk through the list of names displayed above reveals that all proper nouns are capitalized. It is therefore safe to assume that the few instances of lower-case words like 'such', 'a', 'quite', 'not' and 'incredibly' are incorrectly extracted names. Let's put that assumption to the test.
#filter all lower case names and return unique words
lower_case = archives_clean[archives_clean.name.str.islower()].name.unique()
print(lower_case)
['such' 'a' 'quite' 'not' 'one' 'incredibly' 'very' 'my' 'his' 'an' 'actually' 'just' 'getting' 'mad' 'this' 'unacceptable' 'all' 'old' 'infuriating' 'the' 'by' 'officially' 'life' 'light' 'space']
#Count of each lower case word
archives_clean[archives_clean.name.str.islower()].name.value_counts()
a 55 the 8 an 6 one 4 very 4 quite 3 just 3 getting 2 not 2 actually 2 old 1 light 1 life 1 officially 1 by 1 infuriating 1 such 1 all 1 unacceptable 1 this 1 mad 1 his 1 my 1 incredibly 1 space 1 Name: name, dtype: int64
#replace values in the indicated list with the indicated value
archives_clean['name'] = archives_clean['name'].replace(lower_case, 'incorrect_name')
Test¶
#Loop through lower_case elements and confirm that they are not in name column:
for name in lower_case:
    assert name not in list(archives_clean['name'].unique())
Issue #8 archives_df - rating_denominator is greater than or less than 10 in some cases¶
Define¶
- Correct rating_denominator values that are not equal to 10: fix mis-extracted ratings manually and leave legitimate group ratings intact
Code¶
Check for instances where the value of the denominator is less than 10
archives_clean[archives_clean['rating_denominator'] < 10]
tweet_id | timestamp | source | text | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
313 | 835246439529840640 | 2017-02-24 21:54:03+00:00 | iPhone | @jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho | NaN | 960 | 0 | No_Name | None | None | None | None |
516 | 810984652412424192 | 2016-12-19 23:06:23+00:00 | iPhone | Meet Sam. She smiles 24/7 & secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx | https://www.gofundme.com/sams-smile,https://twitter.com/dog_rates/status/810984652412424192/photo/1 | 24 | 7 | Sam | None | None | None | None |
2335 | 666287406224695296 | 2015-11-16 16:11:11+00:00 | iPhone | This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv | https://twitter.com/dog_rates/status/666287406224695296/photo/1 | 1 | 2 | incorrect_name | None | None | None | None |
Since there is no discernible pattern in rows where the value is less than 10, we will have to clean the denominator manually.
archives_clean.loc[archives_clean['tweet_id'] == '835246439529840640', 'rating_denominator'] = 10
archives_clean.loc[archives_clean['tweet_id'] == '810984652412424192', 'rating_denominator'] = np.nan
archives_clean.loc[archives_clean['tweet_id'] == '666287406224695296', 'rating_denominator'] = 10
Let's check cases where the denominator is higher than 10
#Filter the text, rating_numerator and rating_denominator columns where the rating_denominator is greater than 10
archives_clean.loc[archives_clean['rating_denominator'] > 10,['tweet_id','text', 'rating_numerator','rating_denominator']]
tweet_id | text | rating_numerator | rating_denominator | |
---|---|---|---|---|
342 | 832088576586297345 | @docmisterio account started on 11/15/15 | 11 | 15.0 |
433 | 820690176645140481 | The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd | 84 | 70.0 |
902 | 758467244762497024 | Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE | 165 | 150.0 |
1068 | 740373189193256964 | After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ | 9 | 11.0 |
1120 | 731156023742988288 | Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv | 204 | 170.0 |
1165 | 722974582966214656 | Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a | 4 | 20.0 |
1202 | 716439118184652801 | This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq | 50 | 50.0 |
1228 | 713900603437621249 | Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 | 99 | 90.0 |
1254 | 710658690886586372 | Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 | 80 | 80.0 |
1274 | 709198395643068416 | From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK | 45 | 50.0 |
1351 | 704054845121142784 | Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa | 60 | 50.0 |
1433 | 697463031882764288 | Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ | 44 | 40.0 |
1598 | 686035780142297088 | Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating | 4 | 20.0 |
1634 | 684225744407494656 | Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 | 143 | 130.0 |
1635 | 684222868335505415 | Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 | 121 | 110.0 |
1662 | 682962037429899265 | This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 | 7 | 11.0 |
1663 | 682808988178739200 | I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible | 20 | 16.0 |
1779 | 677716515794329600 | IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq | 144 | 120.0 |
1843 | 675853064436391936 | Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw | 88 | 80.0 |
A quick observation from the filtered rows is that, where the denominator is greater than 10, the text usually refers to a group of dogs. These will be left intact, and manual cleaning will be done on the erroneous entries.
#dog - @docmisterio
archives_clean.loc[archives_clean['tweet_id'] == '832088576586297345', 'rating_denominator'] = np.nan
#dog -Bretagne
archives_clean.loc[archives_clean['tweet_id'] == '740373189193256964', 'rating_denominator'] = 10
#dog - Bluebert
archives_clean.loc[archives_clean['tweet_id'] == '716439118184652801', 'rating_denominator'] = 10
#dog - Darrel
archives_clean.loc[archives_clean['tweet_id'] == '682962037429899265', 'rating_denominator'] = 10
Test¶
#unique entries in the rating denominator
archives_clean.rating_denominator.unique()
array([ 10., nan, 70., 150., 170., 20., 90., 80., 50., 40., 130., 110., 16., 120.])
#Test that every rating_denominator value is at least 10 or NaN
for num in archives_clean.rating_denominator:
    assert ((num >= 10) or np.isnan(num))
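The same test can be written as a single vectorized assertion (a sketch using pandas boolean masks):
#vectorized alternative: every denominator is either >= 10 or NaN
assert ((archives_clean.rating_denominator >= 10) | archives_clean.rating_denominator.isna()).all()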
Issue #9. created_at column in the additional_tweets_df is a duplicate of the timestamp column in archives_df¶
Define¶
- Drop the created_at column in additional_tweets_df because the same information is already present in the timestamp column of archives_df
Code¶
#use drop method
additional_clean = additional_clean.drop(columns='created_at')
Test¶
assert 'created_at' not in additional_clean.columns
Tidiness¶
Issue #1 The doggo, floofer, pupper and puppo identifiers should be unpivoted into a single column¶
Define¶
- Reshape the doggo, floofer, pupper and puppo into a single column
Code¶
Examine the dog stages and identify any cases where a dog is assigned to several categories.
archives_clean.groupby(["doggo", "floofer", "pupper", "puppo"]).size().reset_index().rename(columns={0: "count"})
doggo | floofer | pupper | puppo | count | |
---|---|---|---|---|---|
0 | None | None | None | None | 1831 |
1 | None | None | None | puppo | 24 |
2 | None | None | pupper | None | 224 |
3 | None | floofer | None | None | 9 |
4 | doggo | None | None | None | 75 |
5 | doggo | None | None | puppo | 1 |
6 | doggo | None | pupper | None | 10 |
7 | doggo | floofer | None | None | 1 |
Summary Findings
- There is a dog categorized both as doggo and puppo in row 5
- There are 10 dogs categorized both as doggo and pupper in row 6
- There is a dog categorized both as doggo and floofer in row 7
Clean up the stage columns by replacing None and NaN values with an empty string:
#doggo column
archives_clean.doggo.replace('None', '', inplace=True)
archives_clean.doggo.replace('NaN', '', inplace=True)
#floofer column
archives_clean.floofer.replace('None', '', inplace=True)
archives_clean.floofer.replace('NaN', '', inplace=True)
#pupper column
archives_clean.pupper.replace('None', '', inplace=True)
archives_clean.pupper.replace('NaN', '', inplace=True)
#puppo column
archives_clean.puppo.replace('None', '', inplace=True)
archives_clean.puppo.replace('NaN', '', inplace=True)
Check and confirm that all the None and NaN entries have been converted to an empty string
archives_clean.groupby(["doggo", "floofer", "pupper", "puppo"]).size().reset_index().rename(columns={0: "count"})
doggo | floofer | pupper | puppo | count | |
---|---|---|---|---|---|
0 | 1831 | ||||
1 | puppo | 24 | |||
2 | pupper | 224 | |||
3 | floofer | 9 | |||
4 | doggo | 75 | |||
5 | doggo | puppo | 1 | ||
6 | doggo | pupper | 10 | ||
7 | doggo | floofer | 1 |
The whole idea of eliminating the None and NaN values is that it allows us to create a new dog_category column by simply concatenating the existing columns, as shown below:
archives_clean['dog_category'] = archives_clean.pupper + archives_clean.doggo + archives_clean.puppo+ archives_clean.floofer
#Count of unique items in our newly created column
archives_clean['dog_category'].value_counts()
1831 pupper 224 doggo 75 puppo 24 pupperdoggo 10 floofer 9 doggopuppo 1 doggofloofer 1 Name: dog_category, dtype: int64
Filter the text description for the dog categorized as doggopuppo and retrieve the correct stage
archives_clean.loc[archives_clean['dog_category'] == 'doggopuppo', 'text']
191 Here's a puppo participating in the #ScienceMarch. Cleverly disguising her own doggo agenda. 13/10 would keep the planet habitable for https://t.co/cMhq16isel Name: text, dtype: object
#correct stage name is Puppo
archives_clean.loc[archives_clean['dog_category'] == 'doggopuppo', 'dog_category'] = 'puppo'
Repeat the same process for the doggofloofer category
archives_clean.loc[archives_clean['dog_category'] == 'doggofloofer', 'text']
200 At first I thought this was a shy doggo, but it's actually a Rare Canadian Floofer Owl. Amateurs would confuse the two. 11/10 only send dogs https://t.co/TXdT3tmuYk Name: text, dtype: object
#correct name is Floofer
archives_clean.loc[archives_clean['dog_category'] == 'doggofloofer', 'dog_category'] = 'floofer'
Since most of the pupperdoggo entries describe two or more dogs (both a pupper and a doggo), we are going to retain both descriptors.
#pupperdoggo stage
archives_clean.loc[archives_clean['dog_category'] == 'pupperdoggo', 'text']
460 This is Dido. She's playing the lead role in "Pupper Stops to Catch Snow Before Resuming Shadow Box with Dried Apple." 13/10 (IG: didodoggo) https://t.co/m7isZrOBX7 531 Here we have Burke (pupper) and Dexter (doggo). Pupper wants to be exactly like doggo. Both 12/10 would pet at same time https://t.co/ANBpEYHaho 565 Like doggo, like pupper version 2. Both 11/10 https://t.co/9IxWAXFqze 575 This is Bones. He's being haunted by another doggo of roughly the same size. 12/10 deep breaths pupper everything's fine https://t.co/55Dqe0SJNj 705 This is Pinot. He's a sophisticated doggo. You can tell by the hat. Also pointier than your average pupper. Still 10/10 would pet cautiously https://t.co/f2wmLZTPHd 733 Pupper butt 1, Doggo 0. Both 12/10 https://t.co/WQvcPEpH2u 889 Meet Maggie & Lila. Maggie is the doggo, Lila is the pupper. They are sisters. Both 12/10 would pet at the same time https://t.co/MYwR4DQKll 956 Please stop sending it pictures that don't even have a doggo or pupper in them. Churlish af. 5/10 neat couch tho https://t.co/u2c9c7qSg8 1063 This is just downright precious af. 12/10 for both pupper and doggo https://t.co/o5J479bZUC 1113 Like father (doggo), like son (pupper). Both 12/10 https://t.co/pG2inLaOda Name: text, dtype: object
archives_clean.loc[archives_clean['dog_category'] == 'pupperdoggo', 'dog_category'] = 'pupper, doggo'
We will drop the doggo, floofer, pupper and puppo columns at this stage because we've retrieved the data we needed to build our new column.
archives_clean = archives_clean.drop(columns=[ 'doggo', 'floofer', 'pupper', 'puppo'])
Test¶
#unique items in dog category column
archives_clean['dog_category'].value_counts()
1831 pupper 224 doggo 75 puppo 25 floofer 10 pupper, doggo 10 Name: dog_category, dtype: int64
#assert that we don't have 'None' entries in the dog_category column
assert 'None' not in archives_clean['dog_category'].values
#confirm that deleted columns no longer exist
for col in ['doggo', 'floofer', 'pupper', 'puppo']:
    assert col not in list(archives_clean.columns)
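For comparison, pandas' melt function offers another way to unpivot the four stage columns. A sketch against the original archives_df; it produces one row per tweet/stage pair instead of a concatenated label:
#sketch: unpivot the stage columns with melt, then drop the placeholder 'None' rows
stages = archives_df.melt(id_vars='tweet_id',
                          value_vars=['doggo', 'floofer', 'pupper', 'puppo'],
                          var_name='stage_column', value_name='dog_category')
stages = stages[stages['dog_category'] != 'None']
stages['dog_category'].value_counts()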
Cleaning issue #2: Information about a single observational unit is spread over three different tables¶
Define¶
- Merge all three dataframes into a single table
Code¶
#Left join the archives_clean on predictions_clean on tweet_id column
twitter_archive_master = archives_clean.merge(predictions_clean, on='tweet_id', how='left' )
#Left Join twitter_archive_master table with additional_clean on tweet_id column
twitter_archive_master = twitter_archive_master.merge(additional_clean, on='tweet_id', how='left' )
Test¶
Confirm that all predictions_clean columns are present in the twitter_archive_master dataframe
for col in list(predictions_clean.columns):
    assert col in list(twitter_archive_master.columns)
Confirm that all additional_clean columns are present in the twitter_archive_master dataframe
for col in list(additional_clean.columns):
    assert col in list(twitter_archive_master.columns)
Confirm that all archives_clean columns are present in the twitter_archive_master dataframe
for col in list(archives_clean.columns):
    assert col in list(twitter_archive_master.columns)
Storing Data¶
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".
twitter_archive_master.to_csv('twitter_archive_master.csv', index=False)
Analyzing and Visualizing Data¶
Questions to investigate¶
- On which day of the week were most of the tweets sent out?
- How do tweets that were sent out in the morning compare to tweets sent out in the afternoon?
- What is the most common dog name in the dataset?
#Summary stats of the combined dataframe
twitter_archive_master.describe()
rating_numerator | rating_denominator | img_num | p1_conf | p2_conf | p3_conf | retweet_count | favorite_count | followers_count | |
---|---|---|---|---|---|---|---|---|---|
count | 2175.000000 | 2173.000000 | 1994.000000 | 1994.000000 | 1.994000e+03 | 1.994000e+03 | 1085.000000 | 1085.000000 | 1.085000e+03 |
mean | 13.215172 | 10.481362 | 1.203109 | 0.593941 | 1.344195e-01 | 6.024848e-02 | 2725.217512 | 8571.143779 | 3.200945e+06 |
std | 47.725696 | 6.963880 | 0.560777 | 0.271954 | 1.006807e-01 | 5.089067e-02 | 4843.139916 | 11774.371354 | 4.398728e+01 |
min | 0.000000 | 10.000000 | 1.000000 | 0.044333 | 1.011300e-08 | 1.740170e-10 | 2.000000 | 52.000000 | 3.200851e+06 |
25% | 10.000000 | 10.000000 | 1.000000 | 0.362857 | 5.393987e-02 | 1.619283e-02 | 592.000000 | 1855.000000 | 3.200901e+06 |
50% | 11.000000 | 10.000000 | 1.000000 | 0.587635 | 1.174550e-01 | 4.950530e-02 | 1344.000000 | 4078.000000 | 3.200947e+06 |
75% | 12.000000 | 10.000000 | 1.000000 | 0.846285 | 1.951377e-01 | 9.159438e-02 | 3225.000000 | 11065.000000 | 3.201001e+06 |
max | 1776.000000 | 170.000000 | 4.000000 | 1.000000 | 4.880140e-01 | 2.734190e-01 | 79515.000000 | 131075.000000 | 3.201018e+06 |
Research Question 1. On which day of the week were most of the tweets sent out?¶
We'll retrieve the day of the week from the timestamp column and use it to create a new column
# convert timestamp column to day of week name using the day_name method
twitter_archive_master['dayofweek'] = twitter_archive_master['timestamp'].dt.day_name()
#change the datatype of the column to categorical and display the result
twitter_archive_master['dayofweek'] = twitter_archive_master['dayofweek'].astype('category')
twitter_archive_master['dayofweek']
0 Tuesday 1 Tuesday 2 Monday 3 Sunday 4 Saturday 5 Saturday 6 Friday 7 Friday 8 Thursday 9 Wednesday 10 Wednesday 11 Tuesday 12 Tuesday 13 Tuesday 14 Monday 15 Monday 16 Sunday 17 Saturday 18 Saturday 19 Thursday 20 Wednesday Name: dayofweek, dtype: category Categories (7, object): ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
#list of unique weekday names
days = twitter_archive_master['dayofweek'].unique()
#retrieve weekday counts
counts = twitter_archive_master['dayofweek'].value_counts()
counts
Monday 357 Tuesday 326 Wednesday 322 Friday 305 Thursday 305 Saturday 284 Sunday 276 Name: dayofweek, dtype: int64
#display the highest count:
counts.max()
357
plt.figure(figsize = (11,8))
ordered_days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
daybar = sns.countplot(x=twitter_archive_master['dayofweek'], order=ordered_days)
for bar in daybar.containers:
    daybar.bar_label(bar)
daybar.set(title='Number of tweets per day',
ylabel='Count of Tweets',
xlabel='Day of the week');
Insight 1:¶
Most tweets were sent out on a Monday
Research Question 2. How do tweets that were sent out in the morning compare to tweets sent out in the afternoon?¶
#sum favorite_count by whether the tweet was sent in the morning (hour <= 12) or the afternoon
time_diff = twitter_archive_master['favorite_count'].groupby(twitter_archive_master['timestamp'].dt.hour <= 12).sum().rename(
    {True: 'Morning', False: 'Afternoon'})
#Plot bar chart to visualize the time_diff findings
plt.figure(figsize = (7,8))
time_diff_bar= time_diff.plot.bar(xlabel='Time of the day',
color=['#AE2AEC', '#17B2F6'],
ylabel='Total Favourite Counts',
title = 'Comparison of favourite Counts per time of Day',
rot=45,
fontsize=12,
width=0.95,
grid=False
)
time_diff
timestamp Afternoon 5072153.0 Morning 4227538.0 Name: favorite_count, dtype: float64
'Tweets sent out in the morning received a total of {:.0f} likes whereas those sent in the afternoon had a total of {:.0f}'.format(time_diff['Morning'], time_diff['Afternoon'])
'Tweets sent out in the morning received a total of 4227538 likes whereas those sent in the afternoon had a total of 5072153'
Insight 2.¶
Tweets sent out in the afternoon (hours 13-23 under the hour <= 12 split) accumulated more total favorites than tweets sent out in the morning.
Research Question 3. What is the most common dog name in the dataset?¶
#Count the occurrence of each name in the dataset
twitter_archive_master['name'].value_counts()
No_Name 680 incorrect_name 104 Lucy 11 Charlie 11 Oliver 10 Cooper 10 Tucker 9 Penny 9 Lola 8 Sadie 8 Winston 8 Toby 7 Name: name, dtype: int64
#Total count of values in the name column
named = twitter_archive_master['name'].count()
#Number of unnamed dogs
unnamed = twitter_archive_master['name'].value_counts()['No_Name'] + twitter_archive_master['name'].value_counts()['incorrect_name']
'Unnamed dogs represented {:.2f}% of the total count, which was the highest portion of the sample'.format((unnamed/named) *100)
'Unnamed dogs represented 36.05% of the total count, which was the highest portion of the sample'
Insight 3:¶
The series object returned by the value_counts method indicates that the two items with the highest
count
were the unnamed dogs under the no_name
and incorrect_name
categories. The
most
common names after this category were Charlie
and Lucy
which both had a count
of
11.