improve handling of child processes; fix bug in plot_file for large n…

…umber of children; refactor the profile file reading functions
pythonprofilers · rokroskar · Apr 5, 2017 · Apr 6, 2017 · Apr 6, 2017 · Apr 6, 2017
commit 671f3e2961d242feff2042891714b8f21011c195
diff --git a/memory_profiler.py b/memory_profiler.py
@@ -16,7 +16,7 @@
 import inspect
 import subprocess
 import logging
-
+from collections import defaultdict
 
 # TODO: provide alternative when multiprocessing is not available
 try:
@@ -112,10 +112,10 @@ def _get_child_memory(process, meminfo_attr=None):
     # Loop over the child processes and yield their memory
     try:
         for child in getattr(process, children_attr)(recursive=True):
-            yield getattr(child, meminfo_attr)()[0] / _TWO_20
+            yield child.pid, getattr(child, meminfo_attr)()[0] / _TWO_20
     except psutil.NoSuchProcess:
         # https://github.com/fabianp/memory_profiler/issues/71
-        yield 0.0
+        yield (0,0.0) # need to yield a tuple
 
 
 def _get_memory(pid, backend, timestamps=False, include_children=False, filename=None):
@@ -143,7 +143,7 @@ def ps_util_tool():
                 else 'get_memory_info'
             mem = getattr(process, meminfo_attr)()[0] / _TWO_20
             if include_children:
-                mem +=  sum(_get_child_memory(process, meminfo_attr))
+                mem +=  sum((mem for (pid,mem) in _get_child_memory(process, meminfo_attr)))
             if timestamps:
                 return mem, time.time()
             else:
@@ -355,14 +355,14 @@ def memory_usage(proc=-1, interval=.1, timeout=None, timestamps=False,
 
                     # Write children to the stream file
                     if multiprocess:
-                        for idx, chldmem in enumerate(_get_child_memory(proc.pid)):
-                            stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(idx, chldmem, time.time()))
+                        for chldpid, chldmem in _get_child_memory(proc.pid):
+                            stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(chldpid, chldmem, time.time()))
                 else:
                     # Create a nested list with the child memory
                     if multiprocess:
                         mem_usage = [mem_usage]
-                        for chldmem in _get_child_memory(proc.pid):
-                            mem_usage.append(chldmem)
+                        for chldpid, chldmem in _get_child_memory(proc.pid):
+                            mem_usage.append((chldpid,chldmem))
 
                     # Append the memory usage to the return value
                     ret.append(mem_usage)
@@ -399,14 +399,14 @@ def memory_usage(proc=-1, interval=.1, timeout=None, timestamps=False,
 
                     # Write children to the stream file
                     if multiprocess:
-                        for idx, chldmem in enumerate(_get_child_memory(proc.pid)):
-                            stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(idx, chldmem, time.time()))
+                        for child_pid, chldmem in _get_child_memory(proc):
+                            stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(child_pid, chldmem, time.time()))
                 else:
                     # Create a nested list with the child memory
                     if multiprocess:
                         mem_usage = [mem_usage]
-                        for chldmem in _get_child_memory(proc.pid):
-                            mem_usage.append(chldmem)
+                        for chldpid, chldmem in _get_child_memory(proc):
+                            mem_usage.append((chldpid,chldmem))
 
                     # Append the memory usage to the return value
                     ret.append(mem_usage)
@@ -1207,3 +1207,134 @@ def flush(self):
             prof.show_results(stream=out_file)
         else:
             show_results(prof, precision=options.precision, stream=out_file)
+
+
+### I/O
+
+def read_mprofile_file(filename):
+    """Read an mprofile file and return its content.
+
+    Returns
+    =======
+    content: dict
+        Keys:
+
+        - "mem_usage": (list) memory usage values, in MiB
+        - "timestamp": (list) time instant for each memory usage value, in
+            second
+        - "func_timestamp": (dict) for each function, timestamps and memory
+            usage upon entering and exiting.
+        - 'cmd_line': (str) command-line ran for this profile.
+    """
+    func_ts = {}
+    mem_usage = []
+    timestamp = []
+    children  = defaultdict(list)
+    cmd_line = None
+    f = open(filename, "r")
+    for l in f:
+        if l == '\n':
+            raise ValueError('Sampling time was too short')
+        field, value = l.split(' ', 1)
+        if field == "MEM":
+            # mem, timestamp
+            values = value.split(' ')
+            mem_usage.append(float(values[0]))
+            timestamp.append(float(values[1]))
+
+        elif field == "FUNC":
+            values = value.split(' ')
+            f_name, mem_start, start, mem_end, end = values[:5]
+            ts = func_ts.get(f_name, [])
+            ts.append([float(start), float(end),
+                       float(mem_start), float(mem_end)])
+            func_ts[f_name] = ts
+
+        elif field == "CHLD":
+            values = value.split(' ')
+            chldnum = values[0]
+            children[chldnum].append(
+                (float(values[1]), float(values[2]))
+            )
+
+        elif field == "CMDLINE":
+            cmd_line = value
+        else:
+            pass
+    f.close()
+
+    return {"mem_usage": mem_usage, "timestamp": timestamp,
+            "func_timestamp": func_ts, 'filename': filename,
+            'cmd_line': cmd_line, 'children': children}
+
+
+def read_mprofile_file_multiprocess(filename):
+    """Read an mprofile file and return a mem_usage list
+
+    Returns
+    =======
+    content: list
+
+    This is analogous to the list obtained when the `memory_usage` is used
+    """
+
+    mem_usage = []
+    sample = []
+
+    f = open(filename,'r')
+
+    for i,l in enumerate(f):
+        if l == '\n':
+            raise ValueError('Sampling time was too short')
+        field, value = l.split(' ', 1)
+        values = value.split(' ')
+
+        if field=="MEM":
+            # append the existing sample and reset to zero
+            mem_usage.append(sample)
+            sample = []
+            sample.append((float(values[0]), float(values[1])))
+        elif field=="CHLD": 
+            sample.append((int(values[0]), float(values[1])))
+
+    f.close()
+    return mem_usage[1:]
+
+
+def convert_mem_usage_to_df(filename, is_pickle=False): 
+    """Convert a `mem_usage` list to a `pandas.DataFrame`
+
+    Returns
+    =======
+    content: pandas.DataFrame
+
+    Returns a `pandas.DataFrame` with child IDs as columns and the timestamp as an index
+    """
+
+    import pandas as pd
+
+    if is_pickle:
+        from cPickle import load
+        with open(filename) as f: 
+            mem_usage = load(f)
+
+    else:
+        mem_usage = read_mprofile_file_multiprocess(filename)
+        mem_usage = filter(lambda m: len(m) > 1, mem_usage)
+
+    times =[m[0][1] for m in  mem_usage]
+    pids = np.sort(np.unique([m[0] for n in mem_usage for m in n[1:] if not isinstance(m,float)]))
+
+    time_lookup = {time: i for i,time in enumerate(times)}
+    pid_lookup = {pid:i for i,pid in enumerate(pids)}
+
+    data = np.zeros((len(times), len(pids)))
+
+    for i,m in enumerate(mem_usage):
+        t = m[0][1]
+        try: 
+            for pid,mem in m[1:]:
+                data[time_lookup[t]][pid_lookup[pid]] = mem
+        except TypeError:
+            print 'found a bad value in ', i
+    return pd.DataFrame(data, index=times, columns=pids)
diff --git a/mprof b/mprof
@@ -199,11 +199,17 @@ def run_action():
     parser.add_option("--multiprocess", "-M", dest="multiprocess",
                       default=False, action="store_true",
                       help="""Monitors forked processes creating individual plots for each child""")
+    parser.add_option("--pid", "-p", dest="pid",
+                      default=False, action="store_true", 
+                      help="""Monitor an existing process given by PID""")
+    parser.add_option("--timeout", dest="timeout", 
+                      default=None, action="store", type=int,
+                      help="""Timeout in seconds""")
 
     (options, args) = parser.parse_args()
 
-    if len(args) == 0:
-        print("A program to run must be provided. Use -h for help")
+    if (len(args) == 0) and not options.pid:
+        print("A program to run or a pid must be provided. Use -h for help")
         sys.exit(1)
 
     print("{1}: Sampling memory every {0.interval}s".format(
@@ -218,30 +224,36 @@ def run_action():
     mprofile_output = "mprofile_%s.dat" % suffix
 
     # .. TODO: more than one script as argument ? ..
-    if args[0].endswith('.py') and not options.nopython:
-        if not args[0].startswith("python"):
-            args.insert(0, "python")
-        if options.multiprocess:
-            # in multiprocessing mode you want to spawn a separate
-            # python process
-            options.python = False
-    if options.python:
-        print("running as a Python program...")
-        if not args[0].startswith("python"):
-            args.insert(0, "python")
-        cmd_line = get_cmd_line(args)
-        args[1:1] = ("-m", "memory_profiler", "--timestamp",
-                     "-o", mprofile_output)
-        p = subprocess.Popen(args)
+    if not options.pid:
+        if args[0].endswith('.py') and not options.nopython:
+            if not args[0].startswith("python"):
+                args.insert(0, "python")
+            if options.multiprocess:
+                # in multiprocessing mode you want to spawn a separate
+                # python process
+                options.python = False
+        if options.python:
+            print("running as a Python program...")
+            if not args[0].startswith("python"):
+                args.insert(0, "python")
+            cmd_line = get_cmd_line(args)
+            args[1:1] = ("-m", "memory_profiler", "--timestamp",
+                         "-o", mprofile_output)
+            p = subprocess.Popen(args)
+        else:
+            cmd_line = get_cmd_line(args)
+            p = subprocess.Popen(args)
     else:
-        cmd_line = get_cmd_line(args)
-        p = subprocess.Popen(args)
+        p = int(args[0])
 
     with open(mprofile_output, "a") as f:
-        f.write("CMDLINE {0}\n".format(cmd_line))
+        if not options.pid:
+            f.write("CMDLINE {0}\n".format(cmd_line))
+
         mp.memory_usage(proc=p, interval=options.interval, timestamps=True,
                         include_children=options.include_children,
-                        multiprocess=options.multiprocess, stream=f)
+                        multiprocess=options.multiprocess, stream=f,
+                        timeout=options.timeout)
 
 
 def add_brackets(xloc, yloc, xshift=0, color="r", label=None, options=None):
@@ -291,61 +303,6 @@ def add_brackets(xloc, yloc, xshift=0, color="r", label=None, options=None):
         ## pl.plot(xloc[1], yloc[1], ">"+color, markersize=7)
 
 
-def read_mprofile_file(filename):
-    """Read an mprofile file and return its content.
-
-    Returns
-    =======
-    content: dict
-        Keys:
-
-        - "mem_usage": (list) memory usage values, in MiB
-        - "timestamp": (list) time instant for each memory usage value, in
-            second
-        - "func_timestamp": (dict) for each function, timestamps and memory
-            usage upon entering and exiting.
-        - 'cmd_line': (str) command-line ran for this profile.
-    """
-    func_ts = {}
-    mem_usage = []
-    timestamp = []
-    children  = defaultdict(list)
-    cmd_line = None
-    f = open(filename, "r")
-    for l in f:
-        if l == '\n':
-            raise ValueError('Sampling time was too short')
-        field, value = l.split(' ', 1)
-        if field == "MEM":
-            # mem, timestamp
-            values = value.split(' ')
-            mem_usage.append(float(values[0]))
-            timestamp.append(float(values[1]))
-
-        elif field == "FUNC":
-            values = value.split(' ')
-            f_name, mem_start, start, mem_end, end = values[:5]
-            ts = func_ts.get(f_name, [])
-            ts.append([float(start), float(end),
-                       float(mem_start), float(mem_end)])
-            func_ts[f_name] = ts
-
-        elif field == "CHLD":
-            values = value.split(' ')
-            chldnum = values[0]
-            children[chldnum].append(
-                (float(values[1]), float(values[2]))
-            )
-
-        elif field == "CMDLINE":
-            cmd_line = value
-        else:
-            pass
-    f.close()
-
-    return {"mem_usage": mem_usage, "timestamp": timestamp,
-            "func_timestamp": func_ts, 'filename': filename,
-            'cmd_line': cmd_line, 'children': children}
 
 
 def plot_file(filename, index=0, timestamps=True, children=True, options=None):
@@ -355,7 +312,7 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None):
         print("matplotlib is needed for plotting.")
         sys.exit(1)
     import numpy as np  # pylab requires numpy anyway
-    mprofile = read_mprofile_file(filename)
+    mprofile = mp.read_mprofile_file(filename)
 
     if len(mprofile['timestamp']) == 0:
         print('** No memory usage values have been found in the profile '
@@ -413,7 +370,8 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None):
             cmem = np.asarray([item[0] for item in data])
 
             # Plot the line to the figure
-            pl.plot(cts, cmem, "+-"  + mem_line_colors[idx+1 % len(mem_line_colors)],
+            print (idx+1) % len(mem_line_colors)
+            pl.plot(cts, cmem, "+-"  + mem_line_colors[(idx+1) % len(mem_line_colors)],
                      label="child {}".format(proc))
 
             # Detect the maximal child memory point