|
376 | 376 | # map() |
377 | 377 | # replace() |
378 | 378 |
|
379 | | -print(df) |
| 379 | +# print(df) |
380 | 380 |
|
381 | 381 |
|
382 | 382 | # Values in Series that are not in the dictionary |
|
475 | 475 | # 10 NaN |
476 | 476 | # Name: status, dtype: object |
477 | 477 |
|
478 | | -print(df) |
| 478 | +# print(df) |
479 | 479 | # id gender status dept var1 var2 salary |
480 | 480 | # 0 P001 M FT DS 2.0 8.0 NaN |
481 | 481 | # 1 P002 F PT FS 3.0 NaN 54.0 |
|
489 | 489 | # 9 P010 F FT DS NaN 7.0 125.0 |
490 | 490 | # 10 P011 M NaN AWS 6.0 9.0 NaN |
491 | 491 |
|
492 | | -print(df.isnull().sum()) |
| 492 | +# print(df.isnull().sum()) |
493 | 493 | # id 0 |
494 | 494 | # gender 0 |
495 | 495 | # status 3 |
|
499 | 499 | # salary 3 |
500 | 500 | # dtype: int64 |
501 | 501 |
|
502 | | -print(df.isnull().sum(axis=1)) |
| 502 | +# print(df.isnull().sum(axis=1)) |
503 | 503 | # 0 1 |
504 | 504 | # 1 1 |
505 | 505 | # 2 1 |
|
513 | 513 | # 10 2 |
514 | 514 | # dtype: int64 |
515 | 515 |
|
| 516 | +############### Missing value handling methods ################## |
| 517 | + |
| 518 | +# Deleting rows/columns ---> when more than ~70-75% of the values are
| 519 | +# missing. This percentage can change according to the data, so each
| 520 | +# situation should be evaluated case by case.
| 521 | +#
| 522 | +# Replacing with mean/median/mode (imputation) ---> can be applied to a
| 523 | +# feature that holds numeric data (mode also works for categorical data).
| 524 | +#
| 525 | +# Assigning a unique category ---> if a categorical feature has a fixed
| 526 | +# number of classes, we can assign the missing entries to a new class.
| 527 | +#
| 528 | +# Predicting the missing values ---> we can predict the nulls with a
| 529 | +# machine learning algorithm such as linear regression.
| 530 | +#
| 531 | +# Using algorithms that support missing values ---> KNN is a machine
| 532 | +# learning algorithm that works on the principle of a distance measure.
| 533 | +# It can be used when nulls are present: a missing value is imputed from
| 534 | +# the majority (or average) of the K nearest values. (See the sketches below.)
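| | +
| | +# A minimal sketch of the fillna()-based approaches above (added
| | +# illustration; the column choices are just examples from this toy df,
| | +# left commented out so the outputs below stay unchanged):
| | +
| | +# df['salary'] = df['salary'].fillna(df['salary'].mean())     # mean
| | +# df['var1'] = df['var1'].fillna(df['var1'].median())         # median
| | +# df['status'] = df['status'].fillna(df['status'].mode()[0])  # mode
| | +# df['dept'] = df['dept'].fillna('Unknown')                   # new category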
| 535 | + |
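| | +# For KNN-style imputation, scikit-learn's KNNImputer fills a null from the
| | +# K nearest rows (a sketch assuming scikit-learn is installed; not part of
| | +# the original notes, and commented out for the same reason):
| | +
| | +# from sklearn.impute import KNNImputer
| | +# num_cols = ['var1', 'var2', 'salary']
| | +# imputer = KNNImputer(n_neighbors=3)  # impute from the 3 nearest rows
| | +# df[num_cols] = imputer.fit_transform(df[num_cols])
| | +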
| 536 | +# Dropping : |
| 537 | +# dropna() |
| 538 | +# drop() |
| 539 | + |
| 540 | +# print(df)
| 541 | + |
| 542 | +# print(df.dropna(axis=1, how='any', inplace=False))
| 543 | +# id gender |
| 544 | +# 0 P001 M |
| 545 | +# 1 P002 F |
| 546 | +# 2 P003 M |
| 547 | +# 3 P004 F |
| 548 | +# 4 P005 M |
| 549 | +# 5 P006 F |
| 550 | +# 6 P007 M |
| 551 | +# 7 P008 F |
| 552 | +# 8 P009 M |
| 553 | +# 9 P010 F |
| 554 | +# 10 P011 M |
| 555 | + |
| 556 | +# print(df.dropna(axis=0, how='any', inplace=False))
| 557 | +# id gender status dept var1 var2 salary |
| 558 | +# 4 P005 M PT DS 7.0 11.0 58.0 |
| 559 | + |
| 560 | +# print(df.dropna(axis=1, how='all', inplace=False))
| 561 | +# id gender status dept var1 var2 salary |
| 562 | +# 0 P001 M FT DS 2.0 8.0 NaN |
| 563 | +# 1 P002 F PT FS 3.0 NaN 54.0 |
| 564 | +# 2 P003 M NaN AWS 5.0 5.0 59.0 |
| 565 | +# 3 P004 F FT AWS NaN 8.0 120.0 |
| 566 | +# 4 P005 M PT DS 7.0 11.0 58.0 |
| 567 | +# 5 P006 F PT NaN 1.0 NaN 75.0 |
| 568 | +# 6 P007 M FT FS NaN NaN NaN |
| 569 | +# 7 P008 F NaN FS 10.0 2.0 136.0 |
| 570 | +# 8 P009 M PT NaN 14.0 3.0 60.0 |
| 571 | +# 9 P010 F FT DS NaN 7.0 125.0 |
| 572 | +# 10 P011 M NaN AWS 6.0 9.0 NaN |
| 573 | + |
| 574 | +# 'any' : If any NA values are present, drop that row or column. |
| 575 | +# 'all' : If all values are NA, drop that row or column. |
| 576 | + |
| 577 | +# defaults of dropna:
| 578 | +# axis=0, how='any', thresh=None, inplace=False
| | +# (recent pandas raises a TypeError if how and thresh are passed together).
| 579 | + |
| 580 | +# dropna() with default values:
| 581 | +# don't forget the parentheses () -- without them, df.dropna is only a
| 582 | +# reference to the method, so nothing is actually applied.
| 583 | +# print(df.dropna()) |
| 584 | +# id gender status dept var1 var2 salary |
| 585 | +# 4 P005 M PT DS 7.0 11.0 58.0 |
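| | +
| | +# Quick check (added illustration):
| | +# print(type(df.dropna))    # <class 'method'>
| | +# print(type(df.dropna()))  # <class 'pandas.core.frame.DataFrame'>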
| 586 | + |
| 587 | +# subset: the labels (along the other axis) in which to look for null
| 588 | +# values; pass them as a list, e.g. subset=['status'].
| 589 | +# print(df.dropna(subset=['status'])) |
| 590 | +# id gender status dept var1 var2 salary |
| 591 | +# 0 P001 M FT DS 2.0 8.0 NaN |
| 592 | +# 1 P002 F PT FS 3.0 NaN 54.0 |
| 593 | +# 3 P004 F FT AWS NaN 8.0 120.0 |
| 594 | +# 4 P005 M PT DS 7.0 11.0 58.0 |
| 595 | +# 5 P006 F PT NaN 1.0 NaN 75.0 |
| 596 | +# 6 P007 M FT FS NaN NaN NaN |
| 597 | +# 8 P009 M PT NaN 14.0 3.0 60.0 |
| 598 | +# 9 P010 F FT DS NaN 7.0 125.0 |
| 599 | + |
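| | +# subset also works the other way round (added illustration): with axis=1
| | +# it names the row labels to check.
| | +# print(df.dropna(axis=1, subset=[4]))  # row 4 has no NaN -> keeps all columns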
| 600 | + |
| | +# Add an all-NaN helper column to demonstrate dropping with how='all'.
| 601 | +df['delete_me'] = np.nan
| 602 | +# print(df) |
| 603 | +# id gender status dept var1 var2 salary delete_me |
| 604 | +# 0 P001 M FT DS 2.0 8.0 NaN NaN |
| 605 | +# 1 P002 F PT FS 3.0 NaN 54.0 NaN |
| 606 | +# 2 P003 M NaN AWS 5.0 5.0 59.0 NaN |
| 607 | +# 3 P004 F FT AWS NaN 8.0 120.0 NaN |
| 608 | +# 4 P005 M PT DS 7.0 11.0 58.0 NaN |
| 609 | +# 5 P006 F PT NaN 1.0 NaN 75.0 NaN |
| 610 | +# 6 P007 M FT FS NaN NaN NaN NaN |
| 611 | +# 7 P008 F NaN FS 10.0 2.0 136.0 NaN |
| 612 | +# 8 P009 M PT NaN 14.0 3.0 60.0 NaN |
| 613 | +# 9 P010 F FT DS NaN 7.0 125.0 NaN |
| 614 | +# 10 P011 M NaN AWS 6.0 9.0 NaN NaN |
| 615 | + |
| 616 | +df.dropna(axis=1, how='all', inplace=True)  # removes the all-NaN column
| 617 | +# print(df) |
| 618 | +# id gender status dept var1 var2 salary |
| 619 | +# 0 P001 M FT DS 2.0 8.0 NaN |
| 620 | +# 1 P002 F PT FS 3.0 NaN 54.0 |
| 621 | +# 2 P003 M NaN AWS 5.0 5.0 59.0 |
| 622 | +# 3 P004 F FT AWS NaN 8.0 120.0 |
| 623 | +# 4 P005 M PT DS 7.0 11.0 58.0 |
| 624 | +# 5 P006 F PT NaN 1.0 NaN 75.0 |
| 625 | +# 6 P007 M FT FS NaN NaN NaN |
| 626 | +# 7 P008 F NaN FS 10.0 2.0 136.0 |
| 627 | +# 8 P009 M PT NaN 14.0 3.0 60.0 |
| 628 | +# 9 P010 F FT DS NaN 7.0 125.0 |
| 629 | +# 10 P011 M NaN AWS 6.0 9.0 NaN |
| 630 | + |
| 631 | + |
| 632 | +# thresh=N keeps only the rows/columns that have at least N non-NaN
| | +# values (thresh replaces how; recent pandas forbids passing both).
| 633 | +# print(df.dropna(axis=1, thresh=9, inplace=False))
| 634 | +# id gender dept |
| 635 | +# 0 P001 M DS |
| 636 | +# 1 P002 F FS |
| 637 | +# 2 P003 M AWS |
| 638 | +# 3 P004 F AWS |
| 639 | +# 4 P005 M DS |
| 640 | +# 5 P006 F NaN |
| 641 | +# 6 P007 M FS |
| 642 | +# 7 P008 F FS |
| 643 | +# 8 P009 M NaN |
| 644 | +# 9 P010 F DS |
| 645 | +# 10 P011 M AWS |
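| | +
| | +# Why these three columns? Count the non-NaN values per column (added
| | +# illustration):
| | +# print(df.notna().sum())
| | +# id        11
| | +# gender    11
| | +# status     8
| | +# dept       9
| | +# var1       8
| | +# var2       8
| | +# salary     8
| | +# dtype: int64
| | +# Only id, gender and dept hold at least 9 non-NaN values.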
| 646 | + |
| 647 | +# print(df.drop([1, 3, 5])) |
| 648 | +# id gender status dept var1 var2 salary |
| 649 | +# 0 P001 M FT DS 2.0 8.0 NaN |
| 650 | +# 2 P003 M NaN AWS 5.0 5.0 59.0 |
| 651 | +# 4 P005 M PT DS 7.0 11.0 58.0 |
| 652 | +# 6 P007 M FT FS NaN NaN NaN |
| 653 | +# 7 P008 F NaN FS 10.0 2.0 136.0 |
| 654 | +# 8 P009 M PT NaN 14.0 3.0 60.0 |
| 655 | +# 9 P010 F FT DS NaN 7.0 125.0 |
| 656 | +# 10 P011 M NaN AWS 6.0 9.0 NaN |
| 657 | + |
| 658 | +# print(df.drop(index=[1, 2, 3])) |
| 659 | +# id gender status dept var1 var2 salary |
| 660 | +# 0 P001 M FT DS 2.0 8.0 NaN |
| 661 | +# 4 P005 M PT DS 7.0 11.0 58.0 |
| 662 | +# 5 P006 F PT NaN 1.0 NaN 75.0 |
| 663 | +# 6 P007 M FT FS NaN NaN NaN |
| 664 | +# 7 P008 F NaN FS 10.0 2.0 136.0 |
| 665 | +# 8 P009 M PT NaN 14.0 3.0 60.0 |
| 666 | +# 9 P010 F FT DS NaN 7.0 125.0 |
| 667 | +# 10 P011 M NaN AWS 6.0 9.0 NaN |
| 668 | + |
| 669 | +# print(df.drop(['var1', 'var2'], axis=1)) |
| 670 | +# id gender status dept salary |
| 671 | +# 0 P001 M FT DS NaN |
| 672 | +# 1 P002 F PT FS 54.0 |
| 673 | +# 2 P003 M NaN AWS 59.0 |
| 674 | +# 3 P004 F FT AWS 120.0 |
| 675 | +# 4 P005 M PT DS 58.0 |
| 676 | +# 5 P006 F PT NaN 75.0 |
| 677 | +# 6 P007 M FT FS NaN |
| 678 | +# 7 P008 F NaN FS 136.0 |
| 679 | +# 8 P009 M PT NaN 60.0 |
| 680 | +# 9 P010 F FT DS 125.0 |
| 681 | +# 10 P011 M NaN AWS NaN |
| 682 | + |
| 683 | +# print(df.drop(columns=['var1', 'var2'])) |
| 684 | +# id gender status dept salary |
| 685 | +# 0 P001 M FT DS NaN |
| 686 | +# 1 P002 F PT FS 54.0 |
| 687 | +# 2 P003 M NaN AWS 59.0 |
| 688 | +# 3 P004 F FT AWS 120.0 |
| 689 | +# 4 P005 M PT DS 58.0 |
| 690 | +# 5 P006 F PT NaN 75.0 |
| 691 | +# 6 P007 M FT FS NaN |
| 692 | +# 7 P008 F NaN FS 136.0 |
| 693 | +# 8 P009 M PT NaN 60.0 |
| 694 | +# 9 P010 F FT DS 125.0 |
| 695 | +# 10 P011 M NaN AWS NaN |
516 | 696 |
|
517 | 697 | # ------------------------------------------------------------ |
| 698 | + |
518 | 699 | # ------------------------------------------------------------ |
519 | 700 | # ------------------------------------------------------------ |
520 | 701 | # ------------------------------------------------------------ |
|