Skip to content

Commit a474423

Browse files
committed
Added section on dropping missing values (dropna/drop).
1 parent 404e78a commit a474423

File tree

1 file changed

+185
-4
lines changed

1 file changed

+185
-4
lines changed

Handling_with_Missing_Values.py

Lines changed: 185 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@
376376
# map()
377377
# replace()
378378

379-
print(df)
379+
# print(df)
380380

381381

382382
# Values in Series that are not in the dictionary
@@ -475,7 +475,7 @@
475475
# 10 NaN
476476
# Name: status, dtype: object
477477

478-
print(df)
478+
# print(df)
479479
# id gender status dept var1 var2 salary
480480
# 0 P001 M FT DS 2.0 8.0 NaN
481481
# 1 P002 F PT FS 3.0 NaN 54.0
@@ -489,7 +489,7 @@
489489
# 9 P010 F FT DS NaN 7.0 125.0
490490
# 10 P011 M NaN AWS 6.0 9.0 NaN
491491

492-
print(df.isnull().sum())
492+
# print(df.isnull().sum())
493493
# id 0
494494
# gender 0
495495
# status 3
@@ -499,7 +499,7 @@
499499
# salary 3
500500
# dtype: int64
501501

502-
print(df.isnull().sum(axis=1))
502+
# print(df.isnull().sum(axis=1))
503503
# 0 1
504504
# 1 1
505505
# 2 1
@@ -513,8 +513,189 @@
513513
# 10 2
514514
# dtype: int64
515515

516+
############### Missing value handling methods ##################
# NOTE(review): `df` and `np` (numpy) are defined earlier in this file.

# Deleting Rows -----> use when a row/column has more than 70-75% missing
# values. This percentage can change according to the data, so each
# situation should be evaluated case by case.
#
# Replacing With Mean/Median/Mode (Imputation) ---> can be applied on a
# feature which has numeric data.
#
# Assigning A Unique Category ---> if a categorical feature has a definite
# number of classes, we can assign another class.
#
# Predicting The Missing Values ---> we can predict the nulls with the help
# of a machine learning algorithm like linear regression.
#
# Using Algorithms Which Support Missing Values ---> KNN is a machine
# learning algorithm which works on the principle of distance measure. This
# algorithm can be used when there are nulls present in the dataset. KNN
# considers the missing values by taking the majority of the K nearest values.

# Dropping :
# dropna()
# drop()

print(df)

# Drop every COLUMN that contains at least one NaN (axis=1, how='any').
# print(df.dropna(axis=1, how='any', inplace=False))
#      id gender
# 0  P001      M
# 1  P002      F
# 2  P003      M
# 3  P004      F
# 4  P005      M
# 5  P006      F
# 6  P007      M
# 7  P008      F
# 8  P009      M
# 9  P010      F
# 10 P011      M

# Drop every ROW that contains at least one NaN (axis=0, how='any').
# print(df.dropna(axis=0, how='any', inplace=False))
#      id gender status dept  var1  var2  salary
# 4  P005      M     PT   DS   7.0  11.0    58.0

# Drop a COLUMN only when ALL of its values are NaN (axis=1, how='all').
# NOTE(review): the original passed thresh together with how; pandas >= 2.0
# raises "You cannot set both the how and thresh arguments at the same
# time." — supply only one of them.
# print(df.dropna(axis=1, how='all', inplace=False))
#      id gender status dept  var1  var2  salary
# 0  P001      M     FT   DS   2.0   8.0     NaN
# 1  P002      F     PT   FS   3.0   NaN    54.0
# 2  P003      M    NaN  AWS   5.0   5.0    59.0
# 3  P004      F     FT  AWS   NaN   8.0   120.0
# 4  P005      M     PT   DS   7.0  11.0    58.0
# 5  P006      F     PT  NaN   1.0   NaN    75.0
# 6  P007      M     FT   FS   NaN   NaN     NaN
# 7  P008      F    NaN   FS  10.0   2.0   136.0
# 8  P009      M     PT  NaN  14.0   3.0    60.0
# 9  P010      F     FT   DS   NaN   7.0   125.0
# 10 P011      M    NaN  AWS   6.0   9.0     NaN

# 'any' : If any NA values are present, drop that row or column.
# 'all' : If all values are NA, drop that row or column.

# defaults of dropna:
# axis=0, how='any', thresh=None, inplace=False.
# (thresh has no threshold by default; it is NOT 0.)

# dropna() with default values :
# don't forget to use parentheses ().
# If you don't use it, you can't apply it correctly
# print(df.dropna())
#      id gender status dept  var1  var2  salary
# 4  P005      M     PT   DS   7.0  11.0    58.0

# subset: specifies the columns to look for null values in.
# Use subset with a list inside square brackets [].
# print(df.dropna(subset=['status']))
#      id gender status dept  var1  var2  salary
# 0  P001      M     FT   DS   2.0   8.0     NaN
# 1  P002      F     PT   FS   3.0   NaN    54.0
# 3  P004      F     FT  AWS   NaN   8.0   120.0
# 4  P005      M     PT   DS   7.0  11.0    58.0
# 5  P006      F     PT  NaN   1.0   NaN    75.0
# 6  P007      M     FT   FS   NaN   NaN     NaN
# 8  P009      M     PT  NaN  14.0   3.0    60.0
# 9  P010      F     FT   DS   NaN   7.0   125.0

# Add an all-NaN column to demonstrate how='all' dropping it in place.
df['delete_me'] = np.nan
# print(df)
#      id gender status dept  var1  var2  salary  delete_me
# 0  P001      M     FT   DS   2.0   8.0     NaN        NaN
# 1  P002      F     PT   FS   3.0   NaN    54.0        NaN
# 2  P003      M    NaN  AWS   5.0   5.0    59.0        NaN
# 3  P004      F     FT  AWS   NaN   8.0   120.0        NaN
# 4  P005      M     PT   DS   7.0  11.0    58.0        NaN
# 5  P006      F     PT  NaN   1.0   NaN    75.0        NaN
# 6  P007      M     FT   FS   NaN   NaN     NaN        NaN
# 7  P008      F    NaN   FS  10.0   2.0   136.0        NaN
# 8  P009      M     PT  NaN  14.0   3.0    60.0        NaN
# 9  P010      F     FT   DS   NaN   7.0   125.0        NaN
# 10 P011      M    NaN  AWS   6.0   9.0     NaN        NaN

# Drop columns whose values are ALL NaN — removes 'delete_me' in place.
# NOTE(review): do not also pass thresh here; pandas >= 2.0 raises a
# TypeError when both how and thresh are supplied.
df.dropna(axis=1, how='all', inplace=True)
# print(df)
#      id gender status dept  var1  var2  salary
# 0  P001      M     FT   DS   2.0   8.0     NaN
# 1  P002      F     PT   FS   3.0   NaN    54.0
# 2  P003      M    NaN  AWS   5.0   5.0    59.0
# 3  P004      F     FT  AWS   NaN   8.0   120.0
# 4  P005      M     PT   DS   7.0  11.0    58.0
# 5  P006      F     PT  NaN   1.0   NaN    75.0
# 6  P007      M     FT   FS   NaN   NaN     NaN
# 7  P008      F    NaN   FS  10.0   2.0   136.0
# 8  P009      M     PT  NaN  14.0   3.0    60.0
# 9  P010      F     FT   DS   NaN   7.0   125.0
# 10 P011      M    NaN  AWS   6.0   9.0     NaN

# thresh=N requires that a column has at least N non-NaN values to survive.
# (Use thresh INSTEAD of how — they cannot be combined.)
# print(df.dropna(axis=1, thresh=9, inplace=False))
#      id gender dept
# 0  P001      M   DS
# 1  P002      F   FS
# 2  P003      M  AWS
# 3  P004      F  AWS
# 4  P005      M   DS
# 5  P006      F  NaN
# 6  P007      M   FS
# 7  P008      F   FS
# 8  P009      M  NaN
# 9  P010      F   DS
# 10 P011      M  AWS

# drop(): remove rows by label (default axis=0).
# print(df.drop([1, 3, 5]))
#      id gender status dept  var1  var2  salary
# 0  P001      M     FT   DS   2.0   8.0     NaN
# 2  P003      M    NaN  AWS   5.0   5.0    59.0
# 4  P005      M     PT   DS   7.0  11.0    58.0
# 6  P007      M     FT   FS   NaN   NaN     NaN
# 7  P008      F    NaN   FS  10.0   2.0   136.0
# 8  P009      M     PT  NaN  14.0   3.0    60.0
# 9  P010      F     FT   DS   NaN   7.0   125.0
# 10 P011      M    NaN  AWS   6.0   9.0     NaN

# Equivalent keyword form: index=... names the row labels explicitly.
# print(df.drop(index=[1, 2, 3]))
#      id gender status dept  var1  var2  salary
# 0  P001      M     FT   DS   2.0   8.0     NaN
# 4  P005      M     PT   DS   7.0  11.0    58.0
# 5  P006      F     PT  NaN   1.0   NaN    75.0
# 6  P007      M     FT   FS   NaN   NaN     NaN
# 7  P008      F    NaN   FS  10.0   2.0   136.0
# 8  P009      M     PT  NaN  14.0   3.0    60.0
# 9  P010      F     FT   DS   NaN   7.0   125.0
# 10 P011      M    NaN  AWS   6.0   9.0     NaN

# Remove columns by label with axis=1 ...
# print(df.drop(['var1', 'var2'], axis=1))
#      id gender status dept  salary
# 0  P001      M     FT   DS     NaN
# 1  P002      F     PT   FS    54.0
# 2  P003      M    NaN  AWS    59.0
# 3  P004      F     FT  AWS   120.0
# 4  P005      M     PT   DS    58.0
# 5  P006      F     PT  NaN    75.0
# 6  P007      M     FT   FS     NaN
# 7  P008      F    NaN   FS   136.0
# 8  P009      M     PT  NaN    60.0
# 9  P010      F     FT   DS   125.0
# 10 P011      M    NaN  AWS     NaN

# ... or, equivalently, with the columns= keyword.
# print(df.drop(columns=['var1', 'var2']))
#      id gender status dept  salary
# 0  P001      M     FT   DS     NaN
# 1  P002      F     PT   FS    54.0
# 2  P003      M    NaN  AWS    59.0
# 3  P004      F     FT  AWS   120.0
# 4  P005      M     PT   DS    58.0
# 5  P006      F     PT  NaN    75.0
# 6  P007      M     FT   FS     NaN
# 7  P008      F    NaN   FS   136.0
# 8  P009      M     PT  NaN    60.0
# 9  P010      F     FT   DS   125.0
# 10 P011      M    NaN  AWS     NaN
516696

517697
# ------------------------------------------------------------
698+
518699
# ------------------------------------------------------------
519700
# ------------------------------------------------------------
520701
# ------------------------------------------------------------

0 commit comments

Comments
 (0)