Source code for craw.scripts.craw_htmp

#! /usr/bin/env python3

###########################################################################
#                                                                         #
# This file is part of Counter RNAseq Window (craw) package.              #
#                                                                         #
#    Authors: Bertrand Neron                                              #
#    Copyright (c) 2017-2019  Institut Pasteur (Paris).                   #
#    see COPYRIGHT file for details.                                      #
#                                                                         #
#    craw is free software: you can redistribute it and/or modify         #
#    it under the terms of the GNU General Public License as published by #
#    the Free Software Foundation, either version 3 of the License, or    #
#    (at your option) any later version.                                  #
#                                                                         #
#    craw is distributed in the hope that it will be useful,              #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of       #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                 #
#    See the GNU General Public License for more details.                 #
#                                                                         #
#    You should have received a copy of the GNU General Public License    #
#    along with craw (see COPYING file).                                  #
#    If not, see <http://www.gnu.org/licenses/>.                          #
#                                                                         #
###########################################################################

import sys
import os
import argparse
import logging

import matplotlib as mtp
import matplotlib.pyplot as plt

import craw


[docs]def _file_readable(value): """ check value given by the parser :param str value: the value given by the parser for --sort-using-file option :return: the normpath of the value :raises: ArgumentError if the file does not exists, or is not a file, or not readable """ if not os.path.exists(value): raise argparse.ArgumentError(None, "No such file: {}".format(value)) elif not os.path.isfile(value): raise argparse.ArgumentError(None, "{} is not a regular file".format(value)) elif not os.access(value, os.R_OK): raise argparse.ArgumentError(None, "{} is not readable".format(value)) return os.path.normpath(value)
[docs]def _gene_size_parser(value): """ Parse value given by the parser :param value: the value given by the parser for --sort-by-gene-size option :type value: string :return: name of column representing the start of gene, name of the column representing the end of gene :rtype: tuple of 2 string :raise: :class:`argparse.ArgumentError` object if value cannot be parsed """ # this function is called only if value is provided to sort-by-gene-size option rv = value.split(',') if len(rv) == 0: rv = ['', ''] elif len(rv) == 2: pass else: raise argparse.ArgumentError(None, "--sort-by-gene-size {} invalid value. " "Must be start_col, stop_col (separated by a comma), " "default= annotation_start,annotation_end".format(value)) from None return rv
[docs]def _size_fig_parser(value): """ Parse value given by the parser for --size option the value must follow the syntax widexheight[unit] if the unit is omitted unit is inch otherwise unit must be * 'mm' for millimeters * 'cm' for centimeters * 'in' for inches * 'px' for pixels wide and height must be positive integers. :param value: the size of the figure :type value: string :return: the size in inch ask by the user :rtype: tuple of float :raise: :class:`argparse.ArgumentError` object """ def mm2in(value): return value * (1 / 25.4) def cm2in(value): return mm2in(value * 10) def px2in(value): dpi = plt.rcParams['figure.dpi'] return value / dpi err_msg = """--size {} invalid value. The value must be widexheight[unit] or 'raw'. 'wide' and 'height' must be positive integers By default unit is in inches. eg: 7x10 or 7x10in for 7 inches wide by 10 inches height 70x100mm for 70 mm by 100 mm. default=7x10 or 10x7 depending of the figure orientation.""" if value == 'raw': return 'raw' else: unit = 'in' if value[-2:] in ('mm', 'cm', 'in', 'px'): unit = value[-2:] value = value[:-2] try: wide, height = value.split('x') except ValueError: raise argparse.ArgumentError(None, err_msg.format(value)) try: wide = int(wide) height = int(height) except ValueError: raise argparse.ArgumentError(None, err_msg.format(value)) if wide < 0 or height < 0: raise argparse.ArgumentError(None, err_msg.format(value)) if unit == 'mm': wide = mm2in(wide) height = mm2in(height) elif unit == 'cm': wide = cm2in(wide) height = cm2in(height) elif unit == 'px': wide = px2in(wide) height = px2in(height) return wide, height
[docs]def get_version_message(): """ :return: a human readable of craw_htmp version and it's main dependencies. :rtype: str """ # pyplot must be import after the argument parsing # because in function of the environment ($DYSPLAY and options) the craw_htmp behavior # is not the same # so heatmap cannot be import before argument parsing # so I import the dependencies in get_version_message. # it's not very important as get_version_message is called only when --version is set # so the program quit after displaying this message. import numpy as np import pandas as pd import matplotlib as mtp import PIL version_text = craw.get_version_message() version_text += """ Using: - numpy {np_ver} - pandas {pd_ver} - matplotlib {mtp_ver} - pillow {pil_ver} """.format(np_ver=np.__version__, pd_ver=pd.__version__, mtp_ver=mtp.__version__, pil_ver=PIL.PILLOW_VERSION ) return version_text
[docs]def parse_args(args): """ :param args: the arguments and option as provided by sys.argv without the program name :return: the argument parsed :rtype: :class:`argparse.Namespace` object """ parser = argparse.ArgumentParser(description="Compute a figure from a file of coverage compute by craw_coverage.py.", formatter_class=argparse.RawTextHelpFormatter) parser.add_argument(dest="cov_file", help="the path to the coverage file.") data_grp = parser.add_argument_group('optional data options') data_grp.add_argument("--crop", nargs=2, help="""crop the matrix. This option need two values the name of the first and last column to keep [start col, stop col] eg --crop -10 1000 .""") sort_grp = data_grp.add_mutually_exclusive_group() sort_grp.add_argument("--sort-using-col", default=False, help="sort the rows using the column COL.") sort_grp.add_argument("--sort-using-file", type=_file_readable, help="""Sort the rows using a file. The file must have on the first line the name of the column which will use to sort. Each following lines must match to a value of this column in the data.""") sort_grp.add_argument("--sort-by-gene-size", type=_gene_size_parser, nargs='*', metavar='start_col,stop_col', help="""The rows will be sorted by gene size using start-col and stop-col to compute length. start-col and stop-col must be a string separated by comma. If start-col and stop-col are not specified annotation_start and annotation_end for start-col and stop-col respectively will be used. (Don't put this option without value just before the coverage file.)""") matrix_grp = data_grp.add_mutually_exclusive_group() matrix_grp.add_argument("--sense-only", action="store_true", default=False, help="Display only sense matrix (default is display both).") matrix_grp.add_argument("--antisense-only", action="store_true", default=False, help="Display only anti sense matrix (default is display both).") fig_grp = parser.add_argument_group("optional figure options") fig_grp.add_argument("--cmap", default="Blues", help="""The color map used to display data. The allowed values are defined in http:matplotlib.org/examples/color/colormaps_reference.html eg: Blues, BuGn, Greens, GnBu, ... (default: Blues).""") fig_grp.add_argument("--title", help="""The figure title. It will display on the top of the figure. (default: the name of the coverage file without extension).""") fig_grp.add_argument("--dpi", type=int, help="""The resolution of the output . This option work only if --out option is specified with size not raw. (default: matplolibrc figure.dpi""") fig_grp.add_argument("--size", type=_size_fig_parser, help="""Specify the figure size. The value must be widexheight[unit] or 'raw'. If value is raw it will be produce two image files (for sense and antisense) " with one pixel correspond to one coverage value for one nucleotide. Otherwise, 'wide' and 'height' must be positive integers By default units are in inches eg: * 7x10 or 7x10in for 7 inches wide by 10 inches height * 70x100mm for 70 mm by 100 mm. (default: 7x10 or 10x7 depending of the figure orientation).""") fig_grp.add_argument("--norm", choices=["lin", "log", "row", "log+row", "row+log"], default="lin", help="""Which normalization to apply to the data before display them. * lin a linear normalization is applied on the whole matrix. * log a 10 base logarithm will be applied on the data before matrix normalization. * row mean that a linear normalisation is compute row by row. * log+row mean a 10 base logarithm will be applied before a normalisation row by row. ('row+log' is an alias for 'log+row'). (default: lin""") fig_grp.add_argument("--mark", action='append', nargs='*', metavar='POS [COLOR]', help="""* POS is mandatory and must be a positive integer. * COLOR is optional The supported color formats are: - Hexadecimal color specifiers, given as '#rgb' or '#rrggbb'. For example, '#ff0000' specifies pure red. - Common HTML color names. If COLOR is omitted the color corresponding to the highest value in the color map (--cmap) will be used. (this option cannot be the last one just before the coverage file, on the command line.)""") layout = fig_grp.add_mutually_exclusive_group() layout.add_argument("--sense-on-left", action="store_true", default=False, help="Where to display the sense matrix relative to antisense matrix (default is top).") layout.add_argument("--sense-on-right", action="store_true", default=False, help="Where to display the sense matrix relative to antisense matrix (default is top).") layout.add_argument("--sense-on-top", action="store_true", default=False, help="Where to display the sense matrix relative to antisense matrix (default is top).") layout.add_argument("--sense-on-bottom", action="store_true", default=False, help="Where to display the sense matrix relative to antisense matrix (default is top).") parser.add_argument("--out", help="""The name of the file (the format will based on the extension) to save the figure. Instead of displaying the figure on the screen, save it directly in this file. If this option is used with --size raw 2 files will be produced for respectively sense and anti sense. The extension 'sense' or 'antisense' will be added between the name and the suffix eg: --size raw --out foo.png give 2 files 'foo.sense.png' and 'foo.antisense.png'. If no format (determine using the suffix) is given 'png' will be used.""") parser.add_argument("-v", "--verbose", action='count', default=0, help="Increase output verbosity.") parser.add_argument("-q", "--quiet", action="count", default=0, help="Reduce output verbosity.") parser.add_argument("--version", action='version', version=get_version_message(), help="display the version information and quit.") parsed_args = parser.parse_args(args) return parsed_args
[docs]def main(args=None, log_level=None, logger_out=True): """ The entrypoint for craw_html script It will generate a heatmap representing the coverage matrix around the position of interest it can display the results on the screen or write it on file depending the options :param args: The arguments and option representing the command line :type args: list of string :param log_level: the level of verbosity :param logger_out: True if you want to display logs on stdout, False otherwise :return: """ args = sys.argv[1:] if args is None else args parsed_args = parse_args(args) verbosity = max(logging.INFO + (parsed_args.quiet - parsed_args.verbose) * 10, 1) craw.init_logger(verbosity, out=logger_out) log = logging.getLogger('craw.htmp') log.debug("args={}".format(parsed_args)) ########################### # validating some options # ########################### # test if DISPlAY if not os.environ.get('DISPLAY'): if parsed_args.out is not None: _, out_format = os.path.splitext(parsed_args.out) if not out_format: raise RuntimeError(""" 'DISPLAY' variable is not set (you probably run craw_htmp in non graphic environment) So you must specify an output format (add ext to the output file option as 'my_file.png') """) from craw.util import non_interactive_backends try: backend = non_interactive_backends[out_format.lstrip('.')] except KeyError: raise RuntimeError("The '{}' format is not supported, choose among {}.".format( out_format, list(non_interactive_backends.keys()) )) mtp.use(backend) else: raise RuntimeError(""" 'DISPLAY' variable is not set (you probably run craw_htmp in non graphic environment) So you cannot use interactive output please specify an output file (--out). """) # the import of pyplot must be done after setting the backend import matplotlib.pyplot as plt from craw import heatmap try: color_map = plt.cm.get_cmap(parsed_args.cmap) except(AttributeError, ValueError) as err: raise RuntimeError("{} : http:matplotlib.org/examples/color/colormaps_reference.html for example".format(err)) if parsed_args.sort_by_gene_size == []: parsed_args.sort_by_gene_size = ['annotation_start', 'annotation_end'] elif parsed_args.sort_by_gene_size: parsed_args.sort_by_gene_size = parsed_args.sort_by_gene_size[0] if parsed_args.out: out_dir = os.path.dirname(parsed_args.out) or '.' if not os.access(out_dir, os.W_OK): msg = "{} is not writable".format(out_dir) log.error(msg) raise RuntimeError(msg) log.info("Parsing coverage file") data = heatmap.get_data(parsed_args.cov_file) def mark_converter(data, marks): converted_marks = [] for value in marks: pos = int(value[0]) color = value[1] if len(value) == 2 else None converted_marks.append(heatmap.Mark(pos, data, color_map, color=color)) return converted_marks if parsed_args.mark is not None: parsed_args.mark = mark_converter(data, parsed_args.mark) sense_data, antisense_data = heatmap.split_data(data) if parsed_args.sense_only: antisense_data = None if parsed_args.antisense_only: sense_data = None ################ # sorting data # ################ # if data is empty or data is None sort return data # so it's error safe, and not time and space consuming if parsed_args.sort_by_gene_size: start_col, stop_col = parsed_args.sort_by_gene_size sense_data = heatmap.sort(sense_data, 'by_gene_size', start_col=start_col, stop_col=stop_col) antisense_data = heatmap.sort(antisense_data, 'by_gene_size', start_col=start_col, stop_col=stop_col) elif parsed_args.sort_using_col: sense_data = heatmap.sort(sense_data, 'using_col', col=parsed_args.sort_using_col) antisense = heatmap.sort(antisense_data, 'using_col', col=parsed_args.sort_using_col) elif parsed_args.sort_using_file: sense_data = heatmap.sort(sense_data, 'using_file', file=parsed_args.sort_using_file) antisense_data = heatmap.sort(antisense_data, 'using_file', file=parsed_args.sort_using_file) sense_data = heatmap.remove_metadata(sense_data) antisense_data = heatmap.remove_metadata(antisense_data) ################ # croping data # ################ log.info("Croping matrix") if parsed_args.crop: start_col, stop_col = parsed_args.crop sense_data = heatmap.crop_matrix(sense_data, start_col, stop_col) antisense_data = heatmap.crop_matrix(antisense_data, start_col, stop_col) #################### # Normalizing data # #################### log.info("Normalizing data") if parsed_args.norm == "lin": log.info("Linear normalisation") sense_data = heatmap.lin_norm(sense_data) antisense_data = heatmap.lin_norm(antisense_data) elif parsed_args.norm == 'log': log.info("10 base logarithm normalisation") sense_data = heatmap.log_norm(sense_data) antisense_data = heatmap.log_norm(antisense_data) elif parsed_args.norm == 'row': log.info("Linear normalisation by row") sense_data = heatmap.lin_norm_row_by_row(sense_data) antisense_data = heatmap.lin_norm_row_by_row(antisense_data) else: # log+row or row+log log.info("10 base logarithm and normalisation by row") sense_data = heatmap.log_norm_row_by_row(sense_data) antisense_data = heatmap.log_norm_row_by_row(antisense_data) ################## # Drawing figure # ################## log.info("Drawing figure") if parsed_args.size == 'raw': # pillow backend if parsed_args.sense_only: sense_to_compute = ('sense', ) elif parsed_args.antisense_only: sense_to_compute = ('antisense', ) else: sense_to_compute = ('sense', 'antisense') if parsed_args.out: root_name, im_format = os.path.splitext(parsed_args.out) im_format = im_format.lstrip('.') if not im_format: im_format = 'png' else: im_format = 'png' root_name = os.path.splitext(parsed_args.cov_file)[0] out_filename = {} for sense in sense_to_compute: filename = "{filename}.{sense}.{format}".format(filename=root_name, sense=sense, format=im_format) out_filename[sense] = filename if os.path.exists(filename): msg = "{} already exists".format(filename) log.error(msg) raise RuntimeError(msg) for sense in sense_to_compute: log.info("Drawing {}".format(sense)) data = locals()[sense + '_data'] if data is not None: heatmap.draw_raw_image(data, out_filename[sense], color_map, marks=parsed_args.mark) else: log.warning("{} data are empty: skip drawing.".format(sense)) else: # matplotlib backend ########## # layout # ########## if parsed_args.sense_on_left: sense_on = 'left' elif parsed_args.sense_on_right: sense_on = 'right' elif parsed_args.sense_on_top: sense_on = 'top' elif parsed_args.sense_on_bottom: sense_on = 'bottom' else: sense_on = 'top' if parsed_args.title is None: title = os.path.basename(os.path.splitext(parsed_args.cov_file)[0]) else: title = parsed_args.title fig = heatmap.draw_heatmap(sense_data, antisense_data, color_map=color_map, title=title, sense_on=sense_on, size=parsed_args.size, marks=parsed_args.mark) if parsed_args.out: if os.path.exists(parsed_args.out): msg = "The output file: {} already exists.".format(parsed_args.out) log.error(msg) raise RuntimeError(msg) if parsed_args.dpi: fig.savefig(parsed_args.out, dpi=parsed_args.dpi) fig.savefig(parsed_args.out) else: plt.show()
if __name__ == '__name__': main()