## query - better than boolean masking
## (FIX: removed the needless backslash line-continuation inside the
## triple-quoted string — pandas.query ignores whitespace/newlines anyway)
df = df.query("@date_start <= Date <= @date_end and Type in @event_type")

## eval returns the boolean mask corresponding to the given expression
mask = df.eval("something")
import pandas as pd  # FIX: pd was used below but never imported
import gspread
from functools import lru_cache
from gspread_dataframe import get_as_dataframe as get_gsheet
from gspread_dataframe import set_with_dataframe as set_gsheet

# NOTE: requires a service-account key file next to the script
gc = gspread.service_account("key.json")


def gsheet_to_csv(spreadsheet_id, sheet_id=None, sheet_name=None):
    """Read a Google Sheet through its CSV export endpoint.

    Make sure the spreadsheet is publicly viewable — this path needs no
    credentials. Returns a DataFrame, or None if neither `sheet_id` nor
    `sheet_name` was given.
    """
    if sheet_id is not None:
        link = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/gviz/tq?tqx=out:csv&gid={sheet_id}"
    elif sheet_name is not None:
        link = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    else:
        return None
    ## BUG FIX: the keyword is `dtype_backend`, not `backend_dtypes`
    df = pd.read_csv(link, engine="pyarrow", dtype_backend="pyarrow")
    return df


def gsheet_by_api(spreadsheet_id, sheet_id=None, sheet_name=None):
    """Read a Google Sheet through the authenticated gspread API.

    Returns a DataFrame (formulas evaluated), or None if neither
    `sheet_id` nor `sheet_name` was given.
    """
    gsheet = gc.open_by_key(spreadsheet_id)
    if sheet_id is not None:
        sheet = gsheet.get_worksheet_by_id(sheet_id)
    elif sheet_name is not None:
        ## BUG FIX: worksheets are not attributes of the Spreadsheet object;
        ## gspread looks them up by title via .worksheet()
        sheet = gsheet.worksheet(sheet_name)
    else:
        return None
    df = get_gsheet(sheet, evaluate_formulas=True)
    return df


@lru_cache(maxsize=128)  ## or st.cache_data(ttl=ttl_long)
def read_gsheet(spreadsheet_id, sheet_id=None, sheet_name=None,
                parse_dates=None, csv=True):
    """Read a Google Sheet (CSV export by default, API otherwise) and tidy it.

    Drops all-empty rows/columns and parses columns whose name contains
    "date" or "time" as datetimes.
    """
    if csv is True:
        df = gsheet_to_csv(spreadsheet_id, sheet_id, sheet_name)
    else:
        df = gsheet_by_api(spreadsheet_id, sheet_id, sheet_name)
    df = df.dropna(how="all", axis="index")
    df = df.dropna(how="all", axis="columns")
    for col_name in df.columns:
        ## BUG FIX: the original tested `"time".lower() in col_name`,
        ## which lowercases the constant instead of the column name
        if "date" in col_name.lower() or "time" in col_name.lower():
            df[col_name] = pd.to_datetime(df[col_name])
    return df
Old method (not working):
gsheetkey = "1kax9m1FKah7cWPwylxhdJSyqF5eVALjgRbxyPuPg7g0"
sheet_name = "Social_Media_Analysis"
## BUG FIX: the URL literal had been replaced by extraction garbage
## (a "page not found" fragment) and had mismatched quotes; reconstructed
## the export link this snippet originally used. NOTE(review): the
## surrounding text says this method no longer works — kept for reference.
url = f"https://docs.google.com/spreadsheet/ccc?key={gsheetkey}&output=xlsx"
sheet = pd.read_excel(url, sheet_name=sheet_name)
## correlation of each score column with "Overall Scores", strongest first.
## BUG FIX: DataFrame.corr() takes a *method* name ('pearson', ...), not a
## column name — passing "Overall Scores" raises. The intent (matching the
## older version below) is the correlation of every column with the
## "Overall Scores" column, which is what corrwith() computes.
(
    formula_student.iloc[:, 1:11]
    .corrwith(formula_student["Overall Scores"])
    .rename("Correlation")
    .sort_values(ascending=False)
    .iloc[1:]  ## remove the obvious self-correlation of 1.00
)
## this is the unnecessarily complicated way I used before
(
    formula_student.iloc[:, 1:11]
    .corr()
    .rename(columns={"Overall Scores": "Correlation"})
    [["Correlation"]]
    .sort_values("Correlation", ascending=False)
    .iloc[1:, :]  ## drop the trivial self-correlation of 1.00
)
## yearly means with readable column names
## (reconstructed as multi-line: in the one-line form the inline comment
## accidentally swallowed the rest of the statement)
(
    merged.groupby(
        ["Year"],
        observed=True,
        as_index=False,  # so Year does not become the index of the result
    )
    .mean()
    .rename(
        columns={
            "Value": "MeanValue",
            "Overall": "MeanOverall",
            "CPIValue": "MeanCPIValue",
        }
    )
)
## or: mean = merged.groupby(["Year"]).mean().reset_index()
col = "col_name"
## Length of each run of consecutive equal values in `col`:
## a run boundary is wherever the value differs from the previous row,
## so the cumulative sum of those boundaries labels each run.
run_id = df[col].ne(df[col].shift()).cumsum()
df = (
    df.groupby(run_id)[col]
    .value_counts()
    .reset_index(level=0, drop=True)
)
Explanation
## This is the intermediate dataframe produced
## We then group by cumsum
index value shifted not_equal cumsum
0 10 NaN True 1
1 10 10.0 False 1
2 23 10.0 True 2
3 23 23.0 False 2
4 9 23.0 True 3
5 9 9.0 False 3
6 9 9.0 False 3
7 10 9.0 True 4
8 10 10.0 False 4
9 10 10.0 False 4
10 10 10.0 False 4
11 12 10.0 True 5
Dynamically cleaning up datatypes after reading, based on each column's value range — but I would recommend the approach above instead.
def get_optimal_numeric_type(c_min: float, c_max: float, col_type: str):
    """
    Determine the smallest numpy dtype that can hold the range [c_min, c_max].

    Parameters
    ----------
    c_min : float
        The minimum value of the data.
    c_max : float
        The maximum value of the data.
    col_type : str
        The current kind of the column ('int' or 'float').

    Returns
    -------
    optimal_type : numpy type or None
        The smallest matching numpy dtype (e.g. ``np.int8``), or None if
        no candidate dtype can represent the range.
        (FIX: the original docstring claimed a ``str`` return.)
    """
    type_info = np.iinfo if col_type == 'int' else np.finfo
    for dtype in [np.int8, np.int16, np.int32, np.int64,
                  np.float16, np.float32, np.float64]:
        # only consider dtypes of the same kind as the column
        if col_type in str(dtype):
            ## BUG FIX: use inclusive comparisons — values exactly at a
            ## dtype's min/max ARE representable; the original strict
            ## `>` / `<` wrongly rejected them (e.g. 127 for int8)
            if c_min >= type_info(dtype).min and c_max <= type_info(dtype).max:
                return dtype
    return None


## Based on the data type and the range of values, reduce_memory_usage picks
## the smallest dtype that can accommodate the data. For example, if a column
## is integer and its values fit within int8's bounds, it is cast to int8.
def reduce_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce memory usage of a DataFrame by downcasting its columns.

    Numeric columns are cast to the smallest dtype that holds their value
    range; object columns are converted to 'category'. Other dtypes
    (datetime, existing category, ...) are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        The input pandas DataFrame that needs memory optimization.

    Returns
    -------
    df : pd.DataFrame
        A copy of the DataFrame with reduced memory usage.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        col_type = df_copy[col].dtype
        ## ROBUSTNESS FIX: the original treated every non-object column as
        ## numeric, so datetime/category columns crashed in the range check;
        ## restrict the downcast path to genuinely numeric dtypes
        if pd.api.types.is_numeric_dtype(col_type):
            c_min, c_max = df_copy[col].min(), df_copy[col].max()
            col_type_str = 'int' if 'int' in str(col_type) else 'float'
            optimal_type = get_optimal_numeric_type(c_min, c_max, col_type_str)
            if optimal_type:
                ## NOTE(review): downcasting floats to float16/float32 can
                ## still lose precision even when min/max fit — acceptable
                ## trade-off here, but verify for precision-sensitive data
                df_copy[col] = df_copy[col].astype(optimal_type)
        elif col_type == object:
            ## object columns become 'category' to deduplicate strings
            df_copy[col] = df_copy[col].astype('category')
    return df_copy
def check(obj):
    """Print True when `obj` (Series or DataFrame) contains any missing value."""
    has_missing = obj.isnull().values.any()
    print(has_missing)


sheet.apply(check)          ## per column: does any column have missing values?
sheet.apply(check, axis=1)  ## per row: does any row have missing values?
sheet.pipe(check)           ## whole dataframe: any missing value at all?