Read CSV file – Python

In [2]:
import pandas as pd
import os
os.getcwd()
tendulkar = pd.read_csv("/dbfs/FileStore/tables/tendulkar.csv", header='infer')
tendulkar.shape
#os.listdir('/dbfs/FileStore/tables/')
Out[2]: (347, 12)
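Note the two path forms: pandas reads DBFS through its local-file mount (the /dbfs prefix), while the Spark reader below addresses the same file directly as a DBFS path. A minimal illustration of the convention, using the file uploaded above:

# pandas (local file API) vs Spark (DBFS path) views of the same file
pandas_path = "/dbfs/FileStore/tables/tendulkar.csv"
spark_path = "/FileStore/tables/tendulkar.csv"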

Read CSV file – Pyspark

In [4]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext 
conf = SparkConf().setAppName("CDI Transform")
sc = SparkContext.getOrCreate()
sqlContext=SQLContext(sc)

tendulkar1= (sqlContext
         .read.format("com.databricks.spark.csv")
         .options(delimiter=',', header='true', inferschema='true')
         .load("/FileStore/tables/tendulkar.csv"))

Data frame Shape – Python

In [6]:
tendulkar.shape
Out[6]: (347, 12)

Data frame Shape – Pyspark

In [8]:
tendulkar1.count()
len(tendulkar1.columns)

# Helper that mimics pandas' .shape for a Spark DataFrame
def dfShape(df):
  return (df.count(), len(df.columns))

dfShape(tendulkar1)
Out[8]: (347, 12)
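count() is an action, so every dfShape call launches a Spark job that re-reads the CSV; caching the DataFrame makes repeated calls cheap. A minimal sketch:

# mark the DataFrame for in-memory reuse; the first action materialises the cache
tendulkar1.cache()
dfShape(tendulkar1)  # first call computes and populates the cache
dfShape(tendulkar1)  # later calls are served from the cache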

Data frame columns – Python

In [10]:
tendulkar.columns
Out[10]: Index([u'Runs', u'Mins', u'BF', u'4s', u'6s', u'SR', u'Pos', u'Dismissal', u'Inns', u'Opposition', u'Ground', u'Start Date'], dtype='object')

Data frame columns – Pyspark

In [13]:
tendulkar1.columns
Out[13]: ['Runs', 'Mins', 'BF', '4s', '6s', 'SR', 'Pos', 'Dismissal', 'Inns', 'Opposition', 'Ground', 'Start Date']

Dtypes – Python

In [15]:
tendulkar.dtypes
Out[15]:
Runs          object
Mins          object
BF            object
4s            object
6s            object
SR            object
Pos           object
Dismissal     object
Inns          object
Opposition    object
Ground        object
Start Date    object
dtype: object

Dtypes – Pyspark

In [17]:
tendulkar1.dtypes
Out[17]: [('Runs', 'string'), ('Mins', 'string'), ('BF', 'string'), ('4s', 'string'), ('6s', 'string'), ('SR', 'string'), ('Pos', 'string'), ('Dismissal', 'string'), ('Inns', 'string'), ('Opposition', 'string'), ('Ground', 'string'), ('Start Date', 'string')]
In [18]:
tendulkar1.printSchema()
root
 |-- Runs: string (nullable = true)
 |-- Mins: string (nullable = true)
 |-- BF: string (nullable = true)
 |-- 4s: string (nullable = true)
 |-- 6s: string (nullable = true)
 |-- SR: string (nullable = true)
 |-- Pos: string (nullable = true)
 |-- Dismissal: string (nullable = true)
 |-- Inns: string (nullable = true)
 |-- Opposition: string (nullable = true)
 |-- Ground: string (nullable = true)
 |-- Start Date: string (nullable = true)
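Every column is inferred as string because the raw file mixes numbers with markers such as DNB, TDNB, '-' and '*'. If the types are known up front, a schema can be supplied instead of inferred; a minimal sketch that keeps all columns as strings explicitly:

from pyspark.sql.types import StructType, StructField, StringType

fields = ['Runs', 'Mins', 'BF', '4s', '6s', 'SR', 'Pos',
          'Dismissal', 'Inns', 'Opposition', 'Ground', 'Start Date']
schema = StructType([StructField(f, StringType(), True) for f in fields])
tendulkar1 = (sqlContext
          .read.format("com.databricks.spark.csv")
          .options(delimiter=',', header='true')
          .schema(schema)
          .load("/FileStore/tables/tendulkar.csv"))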

Select columns – Python

In [20]:
df=tendulkar[['Runs','Mins','BF']]
df.head(10)
Out[20]:
  Runs Mins   BF
0   15   28   24
1  DNB    -    -
2   59  254  172
3    8   24   16
4   41  124   90
5   35   74   51
6   57  193  134
7    0    1    1
8   24   50   44
9   88  324  266

Select columns – Pyspark

In [22]:
df1=tendulkar1.select('Runs','Mins','BF')
df1.show(10)
+----+----+---+
|Runs|Mins| BF|
+----+----+---+
|  15|  28| 24|
| DNB|   -|  -|
|  59| 254|172|
|   8|  24| 16|
|  41| 124| 90|
|  35|  74| 51|
|  57| 193|134|
|   0|   1|  1|
|  24|  50| 44|
|  88| 324|266|
+----+----+---+
only showing top 10 rows
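Columns can also be referenced through col() objects, which is handy for renaming during the select; a minimal sketch (the BallsFaced name is just for illustration):

from pyspark.sql.functions import col

# select with column objects, renaming BF on the fly ('BallsFaced' is illustrative)
df1 = tendulkar1.select(col('Runs'), col('Mins'), col('BF').alias('BallsFaced'))
df1.show(10)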

Filter rows by criteria – Python

In [24]:
# Note: 'Runs' is still of object (string) dtype, so under Python 2 this
# comparison is always True (a str compares greater than an int) and nothing
# is actually filtered; see the numeric version after the output below
b = tendulkar['Runs'] > 50
df = tendulkar[b]
df.head(10)
Out[24]:
  Runs Mins   BF  4s  6s     SR Pos Dismissal Inns     Opposition        Ground  \
0   15   28   24   2   0   62.5   6    bowled    2     v Pakistan       Karachi
1  DNB    -    -   -   -      -   -         -    4     v Pakistan       Karachi
2   59  254  172   4   0   34.3   6       lbw    1     v Pakistan    Faisalabad
3    8   24   16   1   0     50   6   run out    3     v Pakistan    Faisalabad
4   41  124   90   5   0  45.55   7    bowled    1     v Pakistan        Lahore
5   35   74   51   5   0  68.62   6       lbw    1     v Pakistan       Sialkot
6   57  193  134   6   0  42.53   6    caught    3     v Pakistan       Sialkot
7    0    1    1   0   0      0   6    caught    2  v New Zealand  Christchurch
8   24   50   44   3   0  54.54   6    caught    3  v New Zealand  Christchurch
9   88  324  266   5   0  33.08   6    caught    1  v New Zealand        Napier

  Start Date
0  15-Nov-89
1  15-Nov-89
2  23-Nov-89
3  23-Nov-89
4   1-Dec-89
5   9-Dec-89
6   9-Dec-89
7   2-Feb-90
8   2-Feb-90
9   9-Feb-90
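A filter that actually compares runs numerically needs the column coerced first; DNB, TDNB and '-' become NaN and fail the comparison. A minimal sketch, stripping the '*' marker the same way as the aggregation section below:

# coerce to numbers; non-numeric markers (DNB, TDNB, '-') become NaN
runs = pd.to_numeric(tendulkar['Runs'].str.replace(r"[*]", ""), errors='coerce')
df = tendulkar[runs > 50]
df.head(10)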

Filter rows by criteria – Pyspark

In [26]:
df1=tendulkar1.filter(tendulkar1['Runs']>50)
df1.show(10)
+----+----+---+---+---+-----+---+---------+----+--------------+------------+----------+
|Runs|Mins| BF| 4s| 6s|   SR|Pos|Dismissal|Inns|    Opposition|      Ground|Start Date|
+----+----+---+---+---+-----+---+---------+----+--------------+------------+----------+
|  59| 254|172|  4|  0| 34.3|  6|      lbw|   1|    v Pakistan|  Faisalabad| 23-Nov-89|
|  57| 193|134|  6|  0|42.53|  6|   caught|   3|    v Pakistan|     Sialkot|  9-Dec-89|
|  88| 324|266|  5|  0|33.08|  6|   caught|   1| v New Zealand|      Napier|  9-Feb-90|
|  68| 216|136|  8|  0|   50|  6|   caught|   2|     v England|  Manchester|  9-Aug-90|
| 114| 228|161| 16|  0| 70.8|  4|   caught|   2|   v Australia|       Perth|  1-Feb-92|
| 111| 373|270| 19|  0|41.11|  4|   caught|   2|v South Africa|Johannesburg| 26-Nov-92|
|  73| 272|208|  8|  1|35.09|  5|   caught|   2|v South Africa|   Cape Town|  2-Jan-93|
| 165| 361|296| 24|  1|55.74|  4|   caught|   1|     v England|     Chennai| 11-Feb-93|
|  78| 285|213| 10|  0|36.61|  4|      lbw|   2|     v England|      Mumbai| 19-Feb-93|
|  62| 128|114|  7|  0|54.38|  4|   caught|   1|    v Zimbabwe|       Delhi| 13-Mar-93|
+----+----+---+---+---+-----+---+---------+----+--------------+------------+----------+
only showing top 10 rows
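Spark behaves differently from pandas here: the string column is implicitly cast to a numeric type for the comparison, so non-numeric entries become null and fail the predicate, which is why the DNB rows are gone from this output. Making the cast explicit documents the intent; a minimal sketch:

# explicit cast; non-numeric values become null and are filtered out
df1 = tendulkar1.filter(tendulkar1['Runs'].cast('int') > 50)
df1.show(10)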

Display unique contents of a column – Python

In [28]:
tendulkar = pd.read_csv("/dbfs/FileStore/tables/tendulkar.csv", header='infer')
tendulkar['Runs'].unique()
Out[28]: array(['15', 'DNB', '59', '8', '41', '35', '57', '0', '24', '88', '5', '10', '27', '68', '119*', '21', '11', '16', '7', '40', '148*', '6', '17', '114', '111', '1', '73', '50', '9*', '165', '78', '62', 'TDNB', '28', '104*', '71', '142', '96', '43', '11*', '34', '85', '179', '54', '4', '0*', '52*', '2', '122', '31', '177', '74', '42', '18', '61', '36', '169', '9', '15*', '92', '83', '143', '139', '23', '148', '13', '155*', '79', '47', '113', '67', '136', '29', '53', '124*', '126*', '44*', '217', '116', '52', '45', '97', '20', '39', '201*', '76', '65', '126', '36*', '69', '155', '22*', '103', '26', '90', '176', '117', '86', '12', '193', '16*', '51', '32', '55', '37', '44', '241*', '60*', '194*', '3', '32*', '248*', '94', '22', '109', '19', '14', '28*', '63', '64', '101', '122*', '91', '82', '56*', '154*', '153', '49', '10*', '103*', '160', '100*', '105*', '100', '106', '84', '203', '98', '38', '214', '53*', '111*', '146', '14*', '56', '80', '25', '81', '13*'], dtype=object)
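When only the number of distinct scores matters, pandas can count them directly; a one-line sketch:

# count of distinct values rather than the values themselves
tendulkar['Runs'].nunique()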

Display unique contents of a column – Pyspark

In [30]:
from pyspark.sql.functions import *
tendulkar1= (sqlContext
         .read.format("com.databricks.spark.csv")
         .options(delimiter=',', header='true', inferschema='true')
         .load("/FileStore/tables/tendulkar.csv"))
tendulkar1.select('Runs').rdd.distinct().collect()
Out[30]: [Row(Runs=u'24'), Row(Runs=u'148*'), Row(Runs=u'100*'), Row(Runs=u'0'), Row(Runs=u'16'), Row(Runs=u'80'), Row(Runs=u'146'), Row(Runs=u'19'), Row(Runs=u'96'), Row(Runs=u'16*'), Row(Runs=u'76'), Row(Runs=u'51'), Row(Runs=u'88'), Row(Runs=u'9'), Row(Runs=u'35'), Row(Runs=u'64'), Row(Runs=u'41'), Row(Runs=u'11'), Row(Runs=u'25'), Row(Runs=u'59'), Row(Runs=u'1'), Row(Runs=u'49'), Row(Runs=u'91'), Row(Runs=u'81'), Row(Runs=u'122*'), Row(Runs=u'32*'), Row(Runs=u'18'), Row(Runs=u'60*'), Row(Runs=u'126*'), Row(Runs=u'DNB'), Row(Runs=u'53*'), Row(Runs=u'116'), Row(Runs=u'34'), Row(Runs=u'65'), Row(Runs=u'10'), Row(Runs=u'26'), Row(Runs=u'13*'), Row(Runs=u'36*'), Row(Runs=u'165'), Row(Runs=u'TDNB'), Row(Runs=u'139'), Row(Runs=u'68'), Row(Runs=u'27'), Row(Runs=u'217'), Row(Runs=u'86'), Row(Runs=u'6'), Row(Runs=u'203'), Row(Runs=u'47'), Row(Runs=u'98'), Row(Runs=u'74'), Row(Runs=u'53'), Row(Runs=u'111'), Row(Runs=u'105*'), Row(Runs=u'179'), Row(Runs=u'154*'), Row(Runs=u'13'), Row(Runs=u'28*'), Row(Runs=u'50'), Row(Runs=u'248*'), Row(Runs=u'7'), Row(Runs=u'14*'), Row(Runs=u'214'), Row(Runs=u'32'), Row(Runs=u'44'), Row(Runs=u'28'), Row(Runs=u'56*'), Row(Runs=u'52*'), Row(Runs=u'73'), Row(Runs=u'52'), Row(Runs=u'103*'), Row(Runs=u'126'), Row(Runs=u'67'), Row(Runs=u'12'), Row(Runs=u'20'), Row(Runs=u'201*'), Row(Runs=u'177'), Row(Runs=u'119*'), Row(Runs=u'103'), Row(Runs=u'4'), Row(Runs=u'10*'), Row(Runs=u'92'), Row(Runs=u'55'), Row(Runs=u'84'), Row(Runs=u'142'), Row(Runs=u'31'), Row(Runs=u'45'), Row(Runs=u'44*'), Row(Runs=u'111*'), Row(Runs=u'29'), Row(Runs=u'0*'), Row(Runs=u'113'), Row(Runs=u'39'), Row(Runs=u'241*'), Row(Runs=u'15'), Row(Runs=u'21'), Row(Runs=u'69'), Row(Runs=u'117'), Row(Runs=u'193'), Row(Runs=u'176'), Row(Runs=u'5'), Row(Runs=u'100'), Row(Runs=u'15*'), Row(Runs=u'160'), Row(Runs=u'153'), Row(Runs=u'9*'), Row(Runs=u'79'), Row(Runs=u'54'), Row(Runs=u'85'), Row(Runs=u'143'), Row(Runs=u'124*'), Row(Runs=u'61'), Row(Runs=u'42'), Row(Runs=u'136'), Row(Runs=u'71'), Row(Runs=u'38'), Row(Runs=u'14'), Row(Runs=u'22'), Row(Runs=u'82'), Row(Runs=u'2'), Row(Runs=u'148'), Row(Runs=u'11*'), Row(Runs=u'101'), Row(Runs=u'94'), Row(Runs=u'78'), Row(Runs=u'57'), Row(Runs=u'90'), Row(Runs=u'37'), Row(Runs=u'62'), Row(Runs=u'43'), Row(Runs=u'109'), Row(Runs=u'104*'), Row(Runs=u'169'), Row(Runs=u'155*'), Row(Runs=u'194*'), Row(Runs=u'17'), Row(Runs=u'23'), Row(Runs=u'3'), Row(Runs=u'83'), Row(Runs=u'106'), Row(Runs=u'97'), Row(Runs=u'155'), Row(Runs=u'22*'), Row(Runs=u'56'), Row(Runs=u'114'), Row(Runs=u'122'), Row(Runs=u'36'), Row(Runs=u'63'), Row(Runs=u'40'), Row(Runs=u'8')]
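The detour through the RDD API is unnecessary here; the DataFrame API exposes distinct() directly. A minimal sketch that returns the same rows:

# same distinct values, staying in the DataFrame API
tendulkar1.select('Runs').distinct().collect()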

Aggregate mean, min, max – Python

In [32]:
#tendulkar[['Mins','BF','Runs']].groupby('Runs').mean()
# Remove rows which have DNB
import pandas as pd
import os
os.getcwd()
tendulkar = pd.read_csv("/dbfs/FileStore/tables/tendulkar.csv", header='infer')
tendulkar.shape
a=tendulkar.Runs !="DNB"
tendulkar=tendulkar[a]
tendulkar.shape

# Remove rows which have TDNB
b=tendulkar.Runs !="TDNB"
tendulkar=tendulkar[b]
tendulkar.shape

# Remove rows where BF is '-'
c = tendulkar.BF != "-"
tendulkar = tendulkar[c]

# Remove the '*' character (not-out innings)
tendulkar.Runs = tendulkar.Runs.str.replace(r"[*]", "")

#tendulkar.shape
type(tendulkar['Runs'][0])
tendulkar['Runs']=pd.to_numeric(tendulkar['Runs'])
tendulkar['BF']=pd.to_numeric(tendulkar['BF'])
df=tendulkar[['Runs','BF','Ground']].groupby('Ground').agg(['mean','min','max'])
df.head(10)
Out[32]:
                   Runs                    BF
                   mean min  max         mean min  max
Ground
Adelaide      32.600000   0  153    58.400000   5  205
Ahmedabad     40.125000   4  217    80.062500   3  344
Auckland       5.000000   5    5    13.000000  13   13
Bangalore     54.312500   0  214    98.687500   0  363
Birmingham    46.750000   1  122    71.500000   8  177
Bloemfontein  85.000000  15  155   109.500000  35  184
Bridgetown    26.000000   0   92    45.000000   2  147
Brisbane       7.666667   0   16    23.333333   3   42
Bulawayo      55.000000  36   74    88.500000  49  128
Cape Town     69.857143   9  169   155.285714  28  314
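agg with a list of functions produces hierarchical column labels (Runs/mean, Runs/min, and so on); if flat names are easier to work with downstream, the two levels can be joined. A minimal sketch:

# flatten ('Runs', 'mean') -> 'Runs_mean', etc.
df.columns = ['_'.join(col) for col in df.columns]
df.head()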

Aggregate mean, min, max – Pyspark

In [34]:
from pyspark.sql.functions import *
tendulkar1= (sqlContext
         .read.format("com.databricks.spark.csv")
         .options(delimiter=',', header='true', inferschema='true')
         .load("/FileStore/tables/tendulkar.csv"))
tendulkar1= tendulkar1.where(tendulkar1['Runs'] != 'DNB')

print(dfShape(tendulkar1))
tendulkar1= tendulkar1.where(tendulkar1['Runs'] != 'TDNB')
print(dfShape(tendulkar1))
tendulkar1 = tendulkar1.withColumn('Runs', regexp_replace('Runs', '[*]', ''))
tendulkar1.select('Runs').rdd.distinct().collect()

from pyspark.sql import functions as F
# Note: Runs is still a string column, so min/max below compare
# lexicographically (e.g. '111' < '36'); avg() implicitly casts to double
df = (tendulkar1[['Runs', 'BF', 'Ground']]
      .groupby(tendulkar1['Ground'])
      .agg(F.mean(tendulkar1['Runs']),
           F.min(tendulkar1['Runs']),
           F.max(tendulkar1['Runs'])))
df.show()
(330, 12)
(329, 12)
+-------------+-----------------+---------+---------+
|       Ground|        avg(Runs)|min(Runs)|max(Runs)|
+-------------+-----------------+---------+---------+
|    Bangalore|          54.3125|        0|       96|
|     Adelaide|             32.6|        0|       61|
|Colombo (PSS)|             37.2|       14|       71|
| Christchurch|             12.0|        0|       24|
|     Auckland|              5.0|        5|        5|
|      Chennai|           60.625|        0|       81|
|    Centurion|             73.5|      111|       36|
|     Brisbane|7.666666666666667|        0|        7|
|   Birmingham|            46.75|        1|       40|
|    Ahmedabad|           40.125|      100|        8|
|Colombo (RPS)|            143.0|      143|      143|
|   Chittagong|             57.8|      101|       36|
|    Cape Town|69.85714285714286|       14|        9|
|   Bridgetown|             26.0|        0|       92|
|     Bulawayo|             55.0|       36|       74|
|        Delhi|39.94736842105263|        0|       76|
|   Chandigarh|             11.0|       11|       11|
| Bloemfontein|             85.0|       15|      155|
|Colombo (SSC)|77.55555555555556|      104|        8|
|      Cuttack|              2.0|        2|        2|
+-------------+-----------------+---------+---------+
only showing top 20 rows
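Because Runs is still a string, min and max above compare lexicographically (note Centurion with min 111 and max 36). Casting before aggregating gives numeric results; a minimal sketch:

# cast Runs to int so min/max compare numerically rather than as strings
tendulkar2 = tendulkar1.withColumn('Runs', tendulkar1['Runs'].cast('int'))
df = (tendulkar2.groupby('Ground')
      .agg(F.mean('Runs'), F.min('Runs'), F.max('Runs')))
df.show()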