import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Load the five EdStats CSV exports into DataFrames.
# The shared directory is named once so that relocating the dataset is a
# one-line change; the resulting file paths are identical to the originals.
DATA_DIR = "/content/drive/MyDrive/Datasets/EducatifData"

DfCountrySerie = pd.read_csv(f"{DATA_DIR}/EdStatsCountry-Series.csv")
DfCountry = pd.read_csv(f"{DATA_DIR}/EdStatsCountry.csv")
DfData = pd.read_csv(f"{DATA_DIR}/EdStatsData.csv")
DfFootNote = pd.read_csv(f"{DATA_DIR}/EdStatsFootNote.csv")
DfSeries = pd.read_csv(f"{DATA_DIR}/EdStatsSeries.csv")


# Report the (rows, columns) shape of four of the loaded frames.
# DfFootNote is not included in this report.
print(f"CountrySerie: {DfCountrySerie.shape}")
print(f"Country: {DfCountry.shape}")
print(f"Data: {DfData.shape}")
print(f"Series: {DfSeries.shape}")

CountrySerie: (613, 4)
Country: (241, 32)
Data: (886930, 70)
Series: (3665, 21)


# Print the per-column summary (non-null counts and dtypes) of DfCountrySerie.
DfCountrySerie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   CountryCode  613 non-null    object 
 1   SeriesCode   613 non-null    object 
 2   DESCRIPTION  613 non-null    object 
 3   Unnamed: 3   0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 19.3+ KB


# Print the per-column summary (non-null counts and dtypes) of DfCountry.
DfCountry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 32 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Country Code                                       241 non-null    object 
 1   Short Name                                         241 non-null    object 
 2   Table Name                                         241 non-null    object 
 3   Long Name                                          241 non-null    object 
 4   2-alpha code                                       238 non-null    object 
 5   Currency Unit                                      215 non-null    object 
 6   Special Notes                                      145 non-null    object 
 7   Region                                             214 non-null    object 
 8   Income Group                                       214 non-null    object 
 9   WB-2 code                                          240 non-null    object 
 10  National accounts base year                        205 non-null    object 
 11  National accounts reference year                   32 non-null     float64
 12  SNA price valuation                                197 non-null    object 
 13  Lending category                                   144 non-null    object 
 14  Other groups                                       58 non-null     object 
 15  System of National Accounts                        215 non-null    object 
 16  Alternative conversion factor                      47 non-null     object 
 17  PPP survey year                                    145 non-null    object 
 18  Balance of Payments Manual in use                  181 non-null    object 
 19  External debt Reporting status                     124 non-null    object 
 20  System of trade                                    200 non-null    object 
 21  Government Accounting concept                      161 non-null    object 
 22  IMF data dissemination standard                    181 non-null    object 
 23  Latest population census                           213 non-null    object 
 24  Latest household survey                            141 non-null    object 
 25  Source of most recent Income and expenditure data  160 non-null    object 
 26  Vital registration complete                        111 non-null    object 
 27  Latest agricultural census                         142 non-null    object 
 28  Latest industrial data                             107 non-null    float64
 29  Latest trade data                                  185 non-null    float64
 30  Latest water withdrawal data                       179 non-null    object 
 31  Unnamed: 31                                        0 non-null      float64
dtypes: float64(4), object(28)
memory usage: 60.4+ KB


# Print the per-column summary (non-null counts and dtypes) of DfData,
# whose value columns are one column per year.
DfData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 886930 entries, 0 to 886929
Data columns (total 70 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Country Name    886930 non-null  object 
 1   Country Code    886930 non-null  object 
 2   Indicator Name  886930 non-null  object 
 3   Indicator Code  886930 non-null  object 
 4   1970            72288 non-null   float64
 5   1971            35537 non-null   float64
 6   1972            35619 non-null   float64
 7   1973            35545 non-null   float64
 8   1974            35730 non-null   float64
 9   1975            87306 non-null   float64
 10  1976            37483 non-null   float64
 11  1977            37574 non-null   float64
 12  1978            37576 non-null   float64
 13  1979            36809 non-null   float64
 14  1980            89122 non-null   float64
 15  1981            38777 non-null   float64
 16  1982            37511 non-null   float64
 17  1983            38460 non-null   float64
 18  1984            38606 non-null   float64
 19  1985            90296 non-null   float64
 20  1986            39372 non-null   float64
 21  1987            38641 non-null   float64
 22  1988            38552 non-null   float64
 23  1989            37540 non-null   float64
 24  1990            124405 non-null  float64
 25  1991            74437 non-null   float64
 26  1992            75543 non-null   float64
 27  1993            75793 non-null   float64
 28  1994            77462 non-null   float64
 29  1995            131361 non-null  float64
 30  1996            76807 non-null   float64
 31  1997            73453 non-null   float64
 32  1998            84914 non-null   float64
 33  1999            118839 non-null  float64
 34  2000            176676 non-null  float64
 35  2001            123509 non-null  float64
 36  2002            124205 non-null  float64
 37  2003            130363 non-null  float64
 38  2004            128814 non-null  float64
 39  2005            184108 non-null  float64
 40  2006            140312 non-null  float64
 41  2007            137272 non-null  float64
 42  2008            134387 non-null  float64
 43  2009            142108 non-null  float64
 44  2010            242442 non-null  float64
 45  2011            146012 non-null  float64
 46  2012            147264 non-null  float64
 47  2013            137509 non-null  float64
 48  2014            113789 non-null  float64
 49  2015            131058 non-null  float64
 50  2016            16460 non-null   float64
 51  2017            143 non-null     float64
 52  2020            51436 non-null   float64
 53  2025            51436 non-null   float64
 54  2030            51436 non-null   float64
 55  2035            51436 non-null   float64
 56  2040            51436 non-null   float64
 57  2045            51436 non-null   float64
 58  2050            51436 non-null   float64
 59  2055            51436 non-null   float64
 60  2060            51436 non-null   float64
 61  2065            51436 non-null   float64
 62  2070            51436 non-null   float64
 63  2075            51436 non-null   float64
 64  2080            51436 non-null   float64
 65  2085            51436 non-null   float64
 66  2090            51436 non-null   float64
 67  2095            51436 non-null   float64
 68  2100            51436 non-null   float64
 69  Unnamed: 69     0 non-null       float64
dtypes: float64(66), object(4)
memory usage: 473.7+ MB


# Print the per-column summary (non-null counts and dtypes) of DfFootNote.
DfFootNote.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643638 entries, 0 to 643637
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   CountryCode  643638 non-null  object 
 1   SeriesCode   643638 non-null  object 
 2   Year         643638 non-null  object 
 3   DESCRIPTION  643638 non-null  object 
 4   Unnamed: 4   0 non-null       float64
dtypes: float64(1), object(4)
memory usage: 24.6+ MB


# Print the per-column summary (non-null counts and dtypes) of DfSeries.
DfSeries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3665 entries, 0 to 3664
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Series Code                          3665 non-null   object 
 1   Topic                                3665 non-null   object 
 2   Indicator Name                       3665 non-null   object 
 3   Short definition                     2156 non-null   object 
 4   Long definition                      3665 non-null   object 
 5   Unit of measure                      0 non-null      float64
 6   Periodicity                          99 non-null     object 
 7   Base Period                          314 non-null    object 
 8   Other notes                          552 non-null    object 
 9   Aggregation method                   47 non-null     object 
 10  Limitations and exceptions           14 non-null     object 
 11  Notes from original source           0 non-null      float64
 12  General comments                     14 non-null     object 
 13  Source                               3665 non-null   object 
 14  Statistical concept and methodology  23 non-null     object 
 15  Development relevance                3 non-null      object 
 16  Related source links                 215 non-null    object 
 17  Other web links                      0 non-null      float64
 18  Related indicators                   0 non-null      float64
 19  License Type                         0 non-null      float64
 20  Unnamed: 20                          0 non-null      float64
dtypes: float64(6), object(15)
memory usage: 601.4+ KB


# Show the first row of every frame, each preceded by its banner line.
# `display` is the notebook's rich-output helper.
for _banner, _frame in (
    ("|--------Country--------------|>", DfCountry),
    ("|--------CountrySerie----------|>", DfCountrySerie),
    ("|-------------Data-------------|>", DfData),
    ("|----------FootNote------------|>", DfFootNote),
    ("|------------Series----------|>", DfSeries),
):
    print(_banner)
    display(_frame.head(1))

|--------Country--------------|>

|--------CountrySerie----------|>

|-------------Data-------------|>

|----------FootNote------------|>

|------------Series----------|>


# For every frame, print "<missing>  missing values out of <total cells>".
# The null mask is built once per frame, and the total comes from
# DataFrame.size (rows * columns), which equals the null count plus the
# non-null count that used to be recomputed with extra full scans.
for _frame in (DfCountry, DfCountrySerie, DfData, DfFootNote, DfSeries):
    print(_frame.isnull().sum().sum(), " missing values out of", _frame.size)

2354  missing values out of 7712
613  missing values out of 2452
53455179  missing values out of 62085100
643638  missing values out of 3218190
55203  missing values out of 76965


# For every frame, print the share of missing cells as a whole-number percentage.
# One null-mask scan per frame; DataFrame.size supplies the total cell count
# instead of re-deriving it from separate isnull()/notna() passes.
for _label, _frame in (
    ("DfCountry->", DfCountry),
    ("DfCountrySerie->", DfCountrySerie),
    ("DfData->", DfData),
    ("DfFootNote->", DfFootNote),
    ("DfSeries->", DfSeries),
):
    _missing_pct = _frame.isnull().sum().sum() * 100 / _frame.size
    print(_label, np.round(_missing_pct), "% of missing data")

DfCountry-> 31.0 % of missing data
DfCountrySerie-> 25.0 % of missing data
DfData-> 86.0 % of missing data
DfFootNote-> 20.0 % of missing data
DfSeries-> 72.0 % of missing data


# Distinct indicator names of the data table, in order of first appearance.
ListOfIndicator = pd.unique(DfData["Indicator Name"]).tolist()


# List the distinct column labels of DfCountry.
# The method must be called: without the parentheses the cell only shows the
# bound-method object instead of the labels themselves.
DfCountry.columns.unique()

<bound method Index.unique of Index(['Country Code', 'Short Name', 'Table Name', 'Long Name', '2-alpha code',
       'Currency Unit', 'Special Notes', 'Region', 'Income Group', 'WB-2 code',
       'National accounts base year', 'National accounts reference year',
       'SNA price valuation', 'Lending category', 'Other groups',
       'System of National Accounts', 'Alternative conversion factor',
       'PPP survey year', 'Balance of Payments Manual in use',
       'External debt Reporting status', 'System of trade',
       'Government Accounting concept', 'IMF data dissemination standard',
       'Latest population census', 'Latest household survey',
       'Source of most recent Income and expenditure data',
       'Vital registration complete', 'Latest agricultural census',
       'Latest industrial data', 'Latest trade data',
       'Latest water withdrawal data', 'Unnamed: 31'],
      dtype='object')>


# Distinct values of the 'Income Group' column (a missing value counts as one entry).
pd.unique(DfCountry["Income Group"])

array(['High income: nonOECD', 'Low income', 'Upper middle income', nan,
       'Lower middle income', 'High income: OECD'], dtype=object)


# Keep the name and income-group columns of the countries whose income group
# is exactly 'Upper middle income'.
DfCountriesWithGoodIncome = DfCountry.loc[
    DfCountry["Income Group"].isin(['Upper middle income']),
    ["Short Name", "Income Group"],
]


# Distinct short names of the selected countries, in order of first appearance.
# NOTE(review): these 'Short Name' values are matched against 'Country Name'
# of the data table further down — confirm both columns spell every country
# identically, otherwise some countries are silently left out.
ListOfCountry = pd.unique(DfCountriesWithGoodIncome['Short Name']).tolist()


# Working copy of the data table without the columns at positions 4..34.
# Per the recorded info() listing, those positions hold the years 1970-2000.
Dfs = DfData.drop(columns=DfData.columns[4:35]).copy()


# Inspect the remaining column labels of the working copy.
Dfs.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2020',
       '2025', '2030', '2035', '2040', '2045', '2050', '2055', '2060', '2065',
       '2070', '2075', '2080', '2085', '2090', '2095', '2100', 'Unnamed: 69'],
      dtype='object')


# Drop the two code columns.
Dfs.drop(columns=["Indicator Code", "Country Code"], inplace=True)

# Positional trimming of the year columns; it depends on the column order at
# this point. First remove the nine columns at positions 2..10 ...
Dfs.drop(columns=Dfs.columns[2:11], inplace=True)

# ... then remove everything from position 8 onward. Per the recorded preview,
# what remains is Country Name, Indicator Name and the years 2010-2015.
Dfs.drop(columns=Dfs.columns[8:], inplace=True)


# Preview the first five rows of the trimmed working copy.
Dfs.head(5)


# Restrict the working copy to the selected countries.
# .copy() makes DFTransformed an independent frame rather than a slice of Dfs,
# so the in-place dropna/drop calls applied to it later do not trigger
# pandas' SettingWithCopyWarning.
DFTransformed = Dfs[Dfs['Country Name'].isin(ListOfCountry)].copy()


# Print every indicator whose name contains 'Internet' (case-sensitive).
for elem in DFTransformed['Indicator Name'].unique():
    if 'Internet' in elem:
        print(elem)

Internet users (per 100 people)


# Print every indicator whose name contains all of 'tertiary', 'Enrolment'
# and 'sexes' (case-sensitive substring match).
for elem in DFTransformed['Indicator Name'].unique():
    if all(token in elem for token in ('tertiary', 'Enrolment', 'sexes')):
        print(elem)

Enrolment in post-secondary non-tertiary education, both sexes (number)
Enrolment in post-secondary non-tertiary education, private institutions, both sexes (number)
Enrolment in post-secondary non-tertiary education, public institutions, both sexes (number)
Enrolment in tertiary education per 100,000 inhabitants, both sexes
Enrolment in tertiary education, all programmes, both sexes (number)
Enrolment in tertiary education, ISCED 5 programmes, both sexes (number)
Enrolment in tertiary education, ISCED 6 programmes, both sexes (number)
Enrolment in tertiary education, ISCED 7 programmes, both sexes (number)
Enrolment in tertiary education, ISCED 8 programmes, both sexes (number)


# Print every indicator whose name contains all of 'secondary', 'Enrolment'
# and 'sexes' (case-sensitive substring match).
for elem in DFTransformed['Indicator Name'].unique():
    if all(token in elem for token in ('secondary', 'Enrolment', 'sexes')):
        print(elem)

Enrolment in Grade 1 of lower secondary general education, both sexes (number)
Enrolment in Grade 2 of lower secondary general education, both sexes (number)
Enrolment in Grade 3 of lower secondary general education, both sexes (number)
Enrolment in Grade 4 of lower secondary general education, both sexes (number)
Enrolment in Grade 5 of lower secondary general education, both sexes (number)
Enrolment in Grade 6 of lower secondary general education, both sexes (number)
Enrolment in lower secondary education, both sexes (number)
Enrolment in lower secondary education, private institutions, both sexes (number)
Enrolment in lower secondary education, public institutions, both sexes (number)
Enrolment in lower secondary general education, Grade unspecified, both sexes (number)
Enrolment in lower secondary general, both sexes (number)
Enrolment in lower secondary vocational, both sexes (number)
Enrolment in post-secondary non-tertiary education, both sexes (number)
Enrolment in post-secondary non-tertiary education, private institutions, both sexes (number)
Enrolment in post-secondary non-tertiary education, public institutions, both sexes (number)
Enrolment in secondary education, both sexes (number)
Enrolment in secondary education, private institutions, both sexes (number)
Enrolment in secondary education, public institutions, both sexes (number)
Enrolment in secondary general, both sexes (number)
Enrolment in secondary vocational, both sexes (number)
Enrolment in upper secondary education, both sexes (number)
Enrolment in upper secondary education, private institutions, both sexes (number)
Enrolment in upper secondary education, public institutions, both sexes (number)
Enrolment in upper secondary general, both sexes (number)
Enrolment in upper secondary vocational, both sexes (number)


# Size of the country-filtered frame before rows with missing values are removed.
DFTransformed.shape

(190580, 8)


# Remove every row that has a missing value in any column.
# Rebinding to the result (instead of inplace=True on a frame that was sliced
# out of Dfs) yields the same rows without the SettingWithCopyWarning.
DFTransformed = DFTransformed.dropna()

/usr/local/lib/python3.8/dist-packages/pandas/util/_decorators.py:311: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


# Size of the frame after the rows with missing values were removed.
DFTransformed.shape

(12136, 8)


# Array of the distinct countries that have an 'Internet users' row
# (despite the df prefix this is an array of names, not a DataFrame).
dfCountriesinternet = DFTransformed.loc[
    DFTransformed['Indicator Name'] == "Internet users (per 100 people)",
    'Country Name',
].unique()
len(dfCountriesinternet)

49


# Array of the distinct countries that have a 'secondary general' enrolment row.
dfCountriesSecondary = DFTransformed.loc[
    DFTransformed['Indicator Name'] == "Enrolment in secondary general, both sexes (number)",
    'Country Name',
].unique()
len(dfCountriesSecondary)

28


# Array of the distinct countries that have a tertiary enrolment
# (all programmes) row.
dfCountriesTertiary = DFTransformed.loc[
    DFTransformed['Indicator Name'] == "Enrolment in tertiary education, all programmes, both sexes (number)",
    'Country Name',
].unique()
len(dfCountriesTertiary)

22


# Countries present in dfCountriesSecondary but absent from dfCountriesTertiary.
# The element order follows set iteration and carries no meaning.
list(set(dfCountriesSecondary)-set(dfCountriesTertiary))

['Seychelles',
 'Ecuador',
 'Dominican Republic',
 'Peru',
 'Costa Rica',
 'Suriname',
 'Belize']


# Countries present in dfCountriesTertiary but absent from dfCountriesSecondary.
# The element order follows set iteration and carries no meaning.
list(set(dfCountriesTertiary)-set(dfCountriesSecondary))

['Botswana']


# Remove every row of the countries that appear in dfCountriesSecondary but
# not in dfCountriesTertiary.
# A boolean filter plus an explicit copy replaces the in-place drop-by-index,
# which mutated a sliced frame and raised SettingWithCopyWarning. The rows
# kept are the same as long as the row index is unique, which holds for a
# frame filtered from read_csv's default RangeIndex.
DFTransformed = DFTransformed[
    ~DFTransformed['Country Name'].isin(set(dfCountriesSecondary) - set(dfCountriesTertiary))
].copy()

/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py:4906: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


# Number of distinct countries left in the frame.
DFTransformed['Country Name'].nunique(dropna=False)

45


# Remove every row of the countries that appear in dfCountriesTertiary but
# not in dfCountriesSecondary.
# A boolean filter plus an explicit copy replaces the in-place drop-by-index
# on a sliced frame (the pattern behind SettingWithCopyWarning). The rows kept
# are the same as long as the row index is unique, which holds for a frame
# filtered from read_csv's default RangeIndex.
DFTransformed = DFTransformed[
    ~DFTransformed['Country Name'].isin(set(dfCountriesTertiary) - set(dfCountriesSecondary))
].copy()


# Number of distinct countries left in the frame.
DFTransformed['Country Name'].nunique(dropna=False)

44


# List the distinct countries still present in the frame.
DFTransformed["Country Name"].unique()

array(['Albania', 'Algeria', 'American Samoa', 'Angola', 'Argentina',
       'Azerbaijan', 'Belarus', 'Bosnia and Herzegovina', 'Brazil',
       'Bulgaria', 'China', 'Colombia', 'Cuba', 'Dominica', 'Fiji',
       'Gabon', 'Grenada', 'Hungary', 'Iraq', 'Jamaica', 'Jordan',
       'Kazakhstan', 'Lebanon', 'Libya', 'Malaysia', 'Maldives',
       'Marshall Islands', 'Mauritius', 'Mexico', 'Montenegro', 'Namibia',
       'Palau', 'Panama', 'Romania', 'Serbia', 'South Africa',
       'St. Lucia', 'St. Vincent and the Grenadines', 'Thailand', 'Tonga',
       'Tunisia', 'Turkey', 'Turkmenistan', 'Tuvalu'], dtype=object)


# Recompute, on the reduced frame, the distinct countries that have an
# 'Internet users' row.
dfCountriesinternet = DFTransformed.loc[
    DFTransformed['Indicator Name'] == "Internet users (per 100 people)",
    'Country Name',
].unique()
len(dfCountriesinternet)

41


# Recompute, on the reduced frame, the distinct countries that have a
# 'secondary general' enrolment row.
dfCountriesSecondary = DFTransformed.loc[
    DFTransformed['Indicator Name'] == "Enrolment in secondary general, both sexes (number)",
    'Country Name',
].unique()
len(dfCountriesSecondary)

21


# Recompute, on the reduced frame, the distinct countries that have a
# tertiary enrolment (all programmes) row.
dfCountriesTertiary = DFTransformed.loc[
    DFTransformed['Indicator Name'] == "Enrolment in tertiary education, all programmes, both sexes (number)",
    'Country Name',
].unique()
len(dfCountriesTertiary)

21


# Preview the first three rows of the reduced frame.
DFTransformed.head(3)


# Countries present in dfCountriesinternet but absent from dfCountriesSecondary.
# The element order follows set iteration and carries no meaning.
list(set(dfCountriesinternet)-set(dfCountriesSecondary))

['Jamaica',
 'St. Vincent and the Grenadines',
 'Mexico',
 'Jordan',
 'Tuvalu',
 'Argentina',
 'Grenada',
 'Dominica',
 'Marshall Islands',
 'Maldives',
 'Iraq',
 'Montenegro',
 'South Africa',
 'Tonga',
 'Panama',
 'Fiji',
 'Namibia',
 'Gabon',
 'Turkmenistan',
 'Angola']


# Remove every row of the countries that appear in dfCountriesinternet but
# not in dfCountriesSecondary.
# A boolean filter plus an explicit copy replaces the in-place drop-by-index,
# which mutated a sliced frame and raised SettingWithCopyWarning. The rows
# kept are the same as long as the row index is unique, which holds for a
# frame filtered from read_csv's default RangeIndex.
DFTransformed = DFTransformed[
    ~DFTransformed['Country Name'].isin(set(dfCountriesinternet) - set(dfCountriesSecondary))
].copy()

/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py:4906: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


# Distinct countries with an 'Internet users' row after the last removal.
dfCountriesinternet = DFTransformed.loc[
    DFTransformed['Indicator Name'] == "Internet users (per 100 people)",
    'Country Name',
].unique()
len(dfCountriesinternet)

21


# Names of the 20 countries with the largest 2015 value of the
# 'Internet users (per 100 people)' indicator, highest first.
Top20Internet = (
    DFTransformed.loc[DFTransformed["Indicator Name"] == "Internet users (per 100 people)"]
    .nlargest(20, "2015")["Country Name"]
    .tolist()
)
print(Top20Internet)

['Azerbaijan', 'Lebanon', 'Kazakhstan', 'Hungary', 'Malaysia', 'Belarus', 'Serbia', 'Bosnia and Herzegovina', 'Albania', 'Brazil', 'Bulgaria', 'Colombia', 'Romania', 'Turkey', 'China', 'Mauritius', 'Tunisia', 'St. Lucia', 'Thailand', 'Algeria']


# Names of the 5 countries with the largest 2015 tertiary enrolment
# (all programmes, both sexes), highest first.
Top5Tertiary = (
    DFTransformed.loc[
        DFTransformed["Indicator Name"]
        == "Enrolment in tertiary education, all programmes, both sexes (number)"
    ]
    .nlargest(5, "2015")["Country Name"]
    .tolist()
)
Top5Tertiary

['China', 'Brazil', 'Turkey', 'Colombia', 'Thailand']


# Names of the 5 countries with the largest 2015 secondary enrolment
# (both sexes), highest first.
# NOTE(review): this ranks on "Enrolment in secondary education", whereas the
# country filtering above used "Enrolment in secondary general" — confirm the
# difference is intended.
Top5Secondary = (
    DFTransformed.loc[
        DFTransformed["Indicator Name"]
        == "Enrolment in secondary education, both sexes (number)"
    ]
    .nlargest(5, "2015")["Country Name"]
    .tolist()
)
Top5Secondary

['China', 'Brazil', 'Turkey', 'Thailand', 'Colombia']


# Top-5 tertiary countries that are missing from the top-20 Internet list;
# an empty set means all five are in it.
set(Top5Tertiary)-set(Top20Internet)

set()


# Top-5 secondary countries that are missing from the top-20 Internet list;
# an empty set means all five are in it.
set(Top5Secondary)-set(Top20Internet)

set()

	Country Name	Indicator Name	2010	2011	2012	2013	2014	2015
0	Arab World	Adjusted net enrolment rate, lower secondary, ...	NaN	NaN	NaN	NaN	NaN	NaN
1	Arab World	Adjusted net enrolment rate, lower secondary, ...	NaN	NaN	NaN	NaN	NaN	NaN
2	Arab World	Adjusted net enrolment rate, lower secondary, ...	NaN	NaN	NaN	NaN	NaN	NaN
3	Arab World	Adjusted net enrolment rate, lower secondary, ...	NaN	NaN	NaN	NaN	NaN	NaN
4	Arab World	Adjusted net enrolment rate, primary, both sex...	85.211998	85.24514	86.101669	85.51194	85.320152	NaN

	Country Name	Indicator Name	2010	2011	2012	2013	2014	2015
95992	Albania	Duration of compulsory education (years)	8.0	8.0	9.0	9.0	9.0	9.0
96467	Albania	Enrolment in pre-primary education, both sexes...	74914.0	76389.0	80488.0	81865.0	81448.0	82494.0
96468	Albania	Enrolment in pre-primary education, female (nu...	35340.0	36021.0	38153.0	38939.0	38767.0	39229.0

Main Objectives:¶

1) Observation¶

2) Missing Data¶

Step1) Countries selected by income¶

Step2) Keep the selected countries and the chosen indicators, and remove the unneeded columns¶

Let's remove all rows that contain NaN¶

Let's filter the countries with the available indicators¶

We remove every country that is present for one indicator but missing for the other¶

Having removed the countries that lack secondary or tertiary data, we now filter again on the Internet users indicator¶

China, Brazil, Turkey, Colombia and Thailand all appear in the top 20 countries by Internet users (per 100 people), so they are our top 5 most interesting countries.¶