## query - better than boolean masking
## (FIX: removed the needless backslash line-continuation inside the
## triple-quoted string — pandas.query ignores whitespace/newlines anyway)
df = df.query("@date_start <= Date <= @date_end and Type in @event_type")

## eval returns the boolean mask corresponding to the given expression
mask = df.eval("something")
import pandas as pd  # FIX: pd was used below but never imported
import gspread
from functools import lru_cache
from gspread_dataframe import get_as_dataframe as get_gsheet
from gspread_dataframe import set_with_dataframe as set_gsheet

# NOTE: requires a service-account key file next to the script
gc = gspread.service_account("key.json")


def gsheet_to_csv(spreadsheet_id, sheet_id=None, sheet_name=None):
    """Read a Google Sheet through its CSV export endpoint.

    Make sure the spreadsheet is publicly viewable — this path needs no
    credentials. Returns a DataFrame, or None if neither `sheet_id` nor
    `sheet_name` was given.
    """
    if sheet_id is not None:
        link = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/gviz/tq?tqx=out:csv&gid={sheet_id}"
    elif sheet_name is not None:
        link = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    else:
        return None
    ## BUG FIX: the keyword is `dtype_backend`, not `backend_dtypes`
    df = pd.read_csv(link, engine="pyarrow", dtype_backend="pyarrow")
    return df


def gsheet_by_api(spreadsheet_id, sheet_id=None, sheet_name=None):
    """Read a Google Sheet through the authenticated gspread API.

    Returns a DataFrame (formulas evaluated), or None if neither
    `sheet_id` nor `sheet_name` was given.
    """
    gsheet = gc.open_by_key(spreadsheet_id)
    if sheet_id is not None:
        sheet = gsheet.get_worksheet_by_id(sheet_id)
    elif sheet_name is not None:
        ## BUG FIX: worksheets are not attributes of the Spreadsheet object;
        ## gspread looks them up by title via .worksheet()
        sheet = gsheet.worksheet(sheet_name)
    else:
        return None
    df = get_gsheet(sheet, evaluate_formulas=True)
    return df


@lru_cache(maxsize=128)  ## or st.cache_data(ttl=ttl_long)
def read_gsheet(spreadsheet_id, sheet_id=None, sheet_name=None,
                parse_dates=None, csv=True):
    """Read a Google Sheet (CSV export by default, API otherwise) and tidy it.

    Drops all-empty rows/columns and parses columns whose name contains
    "date" or "time" as datetimes.
    """
    if csv is True:
        df = gsheet_to_csv(spreadsheet_id, sheet_id, sheet_name)
    else:
        df = gsheet_by_api(spreadsheet_id, sheet_id, sheet_name)
    df = df.dropna(how="all", axis="index")
    df = df.dropna(how="all", axis="columns")
    for col_name in df.columns:
        ## BUG FIX: the original tested `"time".lower() in col_name`,
        ## which lowercases the constant instead of the column name
        if "date" in col_name.lower() or "time" in col_name.lower():
            df[col_name] = pd.to_datetime(df[col_name])
    return df
Old method (not working):
gsheetkey = "1kax9m1FKah7cWPwylxhdJSyqF5eVALjgRbxyPuPg7g0"
sheet_name = "Social_Media_Analysis"
## BUG FIX: the URL literal had been replaced by extraction garbage
## (a "page not found" fragment) and had mismatched quotes; reconstructed
## the export link this snippet originally used. NOTE(review): the
## surrounding text says this method no longer works — kept for reference.
url = f"https://docs.google.com/spreadsheet/ccc?key={gsheetkey}&output=xlsx"
sheet = pd.read_excel(url, sheet_name=sheet_name)
## correlation of each score column with "Overall Scores", strongest first.
## BUG FIX: DataFrame.corr() takes a *method* name ('pearson', ...), not a
## column name — passing "Overall Scores" raises. The intent (matching the
## older version below) is the correlation of every column with the
## "Overall Scores" column, which is what corrwith() computes.
(
    formula_student.iloc[:, 1:11]
    .corrwith(formula_student["Overall Scores"])
    .rename("Correlation")
    .sort_values(ascending=False)
    .iloc[1:]  ## remove the obvious self-correlation of 1.00
)
## this is the unnecessarily complicated way I used before
(
    formula_student.iloc[:, 1:11]
    .corr()
    .rename(columns={"Overall Scores": "Correlation"})
    [["Correlation"]]
    .sort_values("Correlation", ascending=False)
    .iloc[1:, :]  ## drop the trivial self-correlation of 1.00
)
## yearly means with readable column names
## (reconstructed as multi-line: in the one-line form the inline comment
## accidentally swallowed the rest of the statement)
(
    merged.groupby(
        ["Year"],
        observed=True,
        as_index=False,  # so Year does not become the index of the result
    )
    .mean()
    .rename(
        columns={
            "Value": "MeanValue",
            "Overall": "MeanOverall",
            "CPIValue": "MeanCPIValue",
        }
    )
)
## or: mean = merged.groupby(["Year"]).mean().reset_index()
col = "col_name"
## Length of each run of consecutive equal values in `col`:
## a run boundary is wherever the value differs from the previous row,
## so the cumulative sum of those boundaries labels each run.
run_id = df[col].ne(df[col].shift()).cumsum()
df = (
    df.groupby(run_id)[col]
    .value_counts()
    .reset_index(level=0, drop=True)
)
Explanation
## This is the intermediate dataframe produced
## We then group by cumsum
index value shifted not_equal cumsum
0 10 NaN True 1
1 10 10.0 False 1
2 23 10.0 True 2
3 23 23.0 False 2
4 9 23.0 True 3
5 9 9.0 False 3
6 9 9.0 False 3
7 10 9.0 True 4
8 10 10.0 False 4
9 10 10.0 False 4
10 10 10.0 False 4
11 12 10.0 True 5
Dynamically cleaning up datatypes after reading, based on each column's value range — but I would recommend the approach above instead.
def get_optimal_numeric_type(c_min: float, c_max: float, col_type: str):
    """
    Determine the smallest numpy dtype that can hold the range [c_min, c_max].

    Parameters
    ----------
    c_min : float
        The minimum value of the data.
    c_max : float
        The maximum value of the data.
    col_type : str
        The current kind of the column ('int' or 'float').

    Returns
    -------
    optimal_type : numpy type or None
        The smallest matching numpy dtype (e.g. ``np.int8``), or None if
        no candidate dtype can represent the range.
        (FIX: the original docstring claimed a ``str`` return.)
    """
    type_info = np.iinfo if col_type == 'int' else np.finfo
    for dtype in [np.int8, np.int16, np.int32, np.int64,
                  np.float16, np.float32, np.float64]:
        # only consider dtypes of the same kind as the column
        if col_type in str(dtype):
            ## BUG FIX: use inclusive comparisons — values exactly at a
            ## dtype's min/max ARE representable; the original strict
            ## `>` / `<` wrongly rejected them (e.g. 127 for int8)
            if c_min >= type_info(dtype).min and c_max <= type_info(dtype).max:
                return dtype
    return None


## Based on the data type and the range of values, reduce_memory_usage picks
## the smallest dtype that can accommodate the data. For example, if a column
## is integer and its values fit within int8's bounds, it is cast to int8.
def reduce_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce memory usage of a DataFrame by downcasting its columns.

    Numeric columns are cast to the smallest dtype that holds their value
    range; object columns are converted to 'category'. Other dtypes
    (datetime, existing category, ...) are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        The input pandas DataFrame that needs memory optimization.

    Returns
    -------
    df : pd.DataFrame
        A copy of the DataFrame with reduced memory usage.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        col_type = df_copy[col].dtype
        ## ROBUSTNESS FIX: the original treated every non-object column as
        ## numeric, so datetime/category columns crashed in the range check;
        ## restrict the downcast path to genuinely numeric dtypes
        if pd.api.types.is_numeric_dtype(col_type):
            c_min, c_max = df_copy[col].min(), df_copy[col].max()
            col_type_str = 'int' if 'int' in str(col_type) else 'float'
            optimal_type = get_optimal_numeric_type(c_min, c_max, col_type_str)
            if optimal_type:
                ## NOTE(review): downcasting floats to float16/float32 can
                ## still lose precision even when min/max fit — acceptable
                ## trade-off here, but verify for precision-sensitive data
                df_copy[col] = df_copy[col].astype(optimal_type)
        elif col_type == object:
            ## object columns become 'category' to deduplicate strings
            df_copy[col] = df_copy[col].astype('category')
    return df_copy
def check(obj):
    """Print True when `obj` (Series or DataFrame) contains any missing value."""
    has_missing = obj.isnull().values.any()
    print(has_missing)


sheet.apply(check)          ## per column: does any column have missing values?
sheet.apply(check, axis=1)  ## per row: does any row have missing values?
sheet.pipe(check)           ## whole dataframe: any missing value at all?