A collection of cheatsheets, tips, and reference guides for several data analysis libraries and frameworks
- Pandas
- Pyspark
- Pypolars
- Dask
- DataTables
- etc
- PySpark is an interface for Apache Spark in Python. It not only allows you to write Spark applications using Python APIs, but also provides the PySpark shell for interactively analyzing your data in a distributed environment. PySpark supports most of Spark’s features, such as Spark SQL, DataFrames, Streaming, MLlib (machine learning) and Spark Core (see the sketch below the documentation link).
- https://spark.apache.org/docs/latest/api/python/reference/index.html
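- A minimal sketch of the DataFrame API and Spark SQL entry points mentioned above (the app name, sample rows and column names are illustrative, not from the original dataset):
from pyspark.sql import SparkSession
# Build (or reuse) a local SparkSession
spark = SparkSession.builder.appName("QuickStart").getOrCreate()
# DataFrame API: create a small in-memory DataFrame and filter it
df = spark.createDataFrame([(1, "Jane", 25), (2, "Mark", 30)], ["id", "name", "age"])
df.filter(df.age > 26).show()
# Spark SQL: query the same data through a temporary view
df.createOrReplaceTempView("people")
spark.sql("SELECT name, age FROM people WHERE age > 26").show()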
pip install pyspark
from pyspark import SparkContext
# Create a Spark Context
sc = SparkContext(master="local[2]")
sc
from pyspark.sql import SparkSession
# Spark Session used for DF
spark = SparkSession.builder.appName("SampleSparkApp").getOrCreate()
df = spark.read.csv("data/dataset.csv")
df.show(5)
df = spark.read.csv("data/dataset.csv",header=True)
df.show(5)
df = spark.read.csv("data/dataset.csv",header=True,infer_schema=True)
df.show(5)
df.first()
df.head(5)
df.columns
df.dtypes
df.printSchema()
df.count()
len(df.columns)
print(df.count(),len(df.columns))
df.describe().show()
df.describe('age').show()
# Column name lookup is case-insensitive by default in Spark
df.select('Col1').show()
df.select('COL1').show()
df.select('col1').show()
df.select('Age','Category').show()
- Bracket or dot notation (df['Age'] or df.Age) returns a Column object and only prints the column name, not the data within the column; wrap it in select() to display the values
df.Age
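- For example, to actually display the values of a column referenced this way:
df.select(df.Age).show(5)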
# Method 1: Using Filter
df.filter(df['Age'] == 25).show()
# or
df.filter(df.Age == 25).show()
# Filter on ==, >, <, >=, <= conditions
df = df.filter(df.Age > 25)
- Combining conditions with & or | requires parentheses around each condition
df = df.filter((df.Age > 25) & (df.Gender == 'M'))
from pyspark.sql.functions import col
df = df.filter(col('first_name').isin([3, 4, 7]))
df.orderBy(df.age.asc())
df.orderBy(df.age.desc())
# Where: Method 1
df.where(df['sex'] == 'f').show()
# Filter and show only a selected column
df.where(df['sex'] == 'f').select('Age','Sex','Category').show(5)
# Add a Column
df.withColumn('Alb_by_10',df['ALB'] * 10).show()
# Persist the addition
df2 = df.withColumn('Alb_by_10',df['ALB'] * 10)
# Rename a column
df = df.withColumnRenamed('dob', 'date_of_birth')
# Select the columns to keep, optionally rename some
from pyspark.sql import functions as F
df = df.select(
    'name',
    'age',
    F.col('dob').alias('date_of_birth'),
)
# Remove columns
df = df.drop('monthly_salary', 'salary_for_year')
# Keep all the columns which also occur in another dataset
df = df.select(*(F.col(c) for c in df2.columns))
# Batch Rename/Clean Columns
for c in df.columns:
    df = df.withColumnRenamed(c, c.lower().replace(' ', '_').replace('-', '_'))
- PyPolars is an open-source Python DataFrame library similar to Pandas. PyPolars utilizes all the available CPU cores and therefore performs computations faster
- Ideally, PyPolars is used when the data is too big for Pandas and too small for Spark
- https://github.com/pola-rs/polars
- Eager API: very similar to Pandas; each operation executes immediately and the result is available as soon as execution completes.
- Lazy API: very similar to Spark; a query first builds a plan (map) of the work, which is then executed in parallel across all CPU cores (see the sketch after this list).
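- A minimal sketch contrasting the two APIs (assumes pypolars and the diamonds.csv dataset used below; exact lazy-API method names may differ between pypolars releases):
import pypolars as pl
import pypolars.lazy as plazy
# Eager: every call runs immediately and returns a concrete result
df = pl.read_csv("data/diamonds.csv")
eager_result = df.groupby("cut").select("price").sum()
# Lazy: calls only build a query plan; nothing runs until .collect()
plan = df.lazy().filter(plazy.col("price") > 300)
lazy_result = plan.collect()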
pip install py-polars
import pypolars as pl
df = pl.read_csv("data/diamonds.csv")
type(df)
pypolars.frame.DataFrame
df.head()
df.tail(10)
df.shape
df.dtypes
[pypolars.datatypes.Float64,
pypolars.datatypes.Utf8,
pypolars.datatypes.Utf8,
pypolars.datatypes.Utf8,
pypolars.datatypes.Float64,
pypolars.datatypes.Float64,
pypolars.datatypes.Int64,
pypolars.datatypes.Float64,
pypolars.datatypes.Float64,
pypolars.datatypes.Float64]
df.columns
# pypolars has no .describe(), so convert to pandas first
df.to_pandas().describe()
# Select Columns By Col Name
# same in pandas:: df['carat']
df['col1']
# Select Columns By Index
df[0]
# Select By Index Position for columns
# same in pandas:: df[0]
df.select_at_idx(0)
# Select Multiple Columns
df[['col1','col2']]
# Select rows 0 to 3 and all columns
# same in pandas:: df.iloc[0:3]
df[0:3]
# Slicing 0 to length of rows
# same in pandas:: df.iloc[0:3]
df.slice(0,3)
# Select the first 3 rows of a column
df[0:3,"col1"]
# Select from different rows of a column
df[[0,4,10],"col1"]
# Select specific rows from multiple columns
df[[0,4,10],["col1","col2"]]
df['col1'].value_counts()
# Get Unique Values
df['cut'].unique()
# Boolean Indexing & Condition & Find
# Find all rows/datapoint with cut as ideal
df[df['cut'] == "Ideal"]
# Check For Price More than A Value (300)
df[df['price'] > 300]
# Method 1
df['col1'].apply(round)
# Method 2 Using Lambda
df['col1'].apply(lambda x : round(x))
import pypolars.lazy as plazy
# Example mapping for the 'cut' categories (the values here are illustrative)
mydict = {'Ideal': 1, 'Premium': 2, 'Very Good': 3, 'Good': 4, 'Fair': 5}
# Define your custom function
def cust_mapcut_values(s: plazy.Series) -> plazy.Series:
    return s.apply(lambda x: mydict[x])
# Apply the function lazily to the 'cut' column
output = df.lazy().with_column(plazy.col('cut').map(cust_mapcut_values))
# Execute and Collect Results
output.collect()
# same in pandas:: df.groupby('cut')['price'].sum()
df.groupby('cut').select('price').sum()
# Selecting Multiple Columns From Groupby
df.groupby('cut').select(['price','carat']).sum()
# Select every column and apply a function (first value per group)
df.groupby(['cut','color']).select_all().first()
# Create A Series
x = pl.Series('X',['A','B','C'])
y = pl.Series('Y',[3,5,4])
# Load Data Vis Pkgs
import matplotlib.pyplot as plt
# Bar Chart Using Matplotlib
plt.bar(x,y)
Dask can parallelise operations just as easily on a single computer as on a server. It automatically detects the cores available on a machine and intelligently distributes the workload across them (see the sketch after the install commands).
pip install dask
pip install "dask[complete]"
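- A minimal sketch of the parallel read-and-inspect flow described above (reuses the data/dataset.csv file from the PySpark examples; the columns are whatever that file contains):
import dask.dataframe as dd
# Lazily read the CSV; Dask splits it into partitions that run across CPU cores
ddf = dd.read_csv("data/dataset.csv")
ddf.head(5)                 # computes only the first partition
ddf.describe().compute()    # .compute() runs the full parallel computation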
// import packages
const dataForge = require('data-forge');
require('data-forge-fs');
const df = new dataForge.DataFrame({
    columnNames: ["id", "name", "sex", "age"],
    rows: [
        [1, "Jesse", "male", 25],
        [2, "Jane", "female", 25],
        [3, "Mark", "male", 20],
        [4, "Peter", "male", 55],
        [5, "Paula", "female", 35],
    ]
})
df.toArray()
df.toString()
let data = [{ A: 1, B: 10 }, { A: 2, B: 20 }, { A: 3, B: 30 }];
let df2 = new dataForge.DataFrame(data);
df2.toString()
const df = dataForge.readFileSync('data/sodata.csv').parseCSV();
df.toArray()
df.toRows()
df.toPairs()
df.toJSON()
df.head(5).toString()
df.tail(5).toString()
df.detectTypes().toString()
df.detectValues().toString()
df.getColumnNames()
let df2 = df.renameSeries({"title":"text"})
df2.getColumnNames()
- pandas:: df.iloc[10]
let row10 = df.at(10)
row10
- pandas:: df.loc[10:20]
df.between(10,20).toString()
df.skip(10).take(20).toString()
- pandas:: df['col1'] or df.col1
df.getSeries("tags").toString()
df.deflate(row=>row.tags).toString()
df.subset(['tags','title']).toString()
df.dropSeries('tags').head(5).toString()
df.dropSeries(['title','tags']).head(5).toArray()
df.where(row=>row['python'] == 0.0).toString()
df.transformSeries({title: value => value.toUpperCase()}).toString()
let pyHeight = df.getSeries("python")
let pyHeight2 = pyHeight.select(value => value + 2.5)
pyHeight2.toString()
df.withSeries("Height", pyHeight2)
df.groupBy(row => row.sex).select(group => {
    return {
        Sex: group.first().sex,
        Count: group.count()
    };
}).inflate();
- Jesus Saves @JCharisTech
- By Jesse E.Agbe(JCharis)