Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

#! /usr/bin/env python3 

 

########################################################################### 

# # 

# This file is part of Counter RNAseq Window (craw) package. # 

# # 

# Authors: Bertrand Neron # 

# Copyright (c) 2017-2019 Institut Pasteur (Paris). # 

# see COPYRIGHT file for details. # 

# # 

# craw is free software: you can redistribute it and/or modify # 

# it under the terms of the GNU General Public License as published by # 

# the Free Software Foundation, either version 3 of the License, or # 

# (at your option) any later version. # 

# # 

# craw is distributed in the hope that it will be useful, # 

# but WITHOUT ANY WARRANTY; without even the implied warranty of # 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # 

# See the GNU General Public License for more details. # 

# # 

# You should have received a copy of the GNU General Public License # 

# along with craw (see COPYING file). # 

# If not, see <http://www.gnu.org/licenses/>. # 

# # 

########################################################################### 

 

import os 

import sys 

import argparse 

import itertools 

import logging 

import pysam 

 

import craw 

from craw.util import progress 

from craw import argparse_util 

from craw import annotation, coverage 

from craw.wig import WigParser 

 

 

def positive_int(value): 

""" 

Parse value given by the parser 

 

:param value: the value given by the parser 

:type value: string 

:return: the integer corresponding to the value 

:rtype: int 

:raise: :class:`argparse.ArgumentTypeError` 

""" 

try: 

value = int(value) 

except ValueError: 

raise argparse.ArgumentTypeError("must be a positive integer, got: {}".format(value)) 

if value < 0: 

msg = "must be a positive integer, got: {}".format(value) 

raise argparse.ArgumentTypeError(msg) 

return value 

 

 

def quality_checker(value): 

""" 

Parse value given by the parser 

 

:param value: the value given by the parser 

:type value: string 

:return: the integer >=0 and <=42 corresponding to the value 

:rtype: int 

:raise: :class:`argparse.ArgumentTypeError` if value does not represent a integer >=0 and <=42 

""" 

 

try: 

value = int(value) 

except ValueError: 

raise argparse.ArgumentTypeError("must be a integer between 0 and 42, got: {}".format(value)) 

if not 0 <= value <= 42: 

raise argparse.ArgumentTypeError("must be a integer between 0 and 42, got: {}".format(value)) 

return value 

 

 

def get_result_header(annot_parser, parsed_args): 

""" 

Compute the header for the results. 

the firts lines start with # 

they contains some general information about the craw (version) 

and options used (for tracbility) 

the last line is the header of columns separated by --sep option 

and can be used as header with pandas 

 

:param annot_parser: the annotation parser 

:type annot_parser: :class:`annotation.AnnotationParser` object 

:param parsed_args: the command line argument parsed with argparse 

:type parsed_args: :class:`argparse.Namespace` 

:return: The header of the result file 

:rtype: str 

""" 

def version_infos(): 

header = "# Running Counter RnAseq Window craw_coverage\n" 

commented_ver = get_version_message().rstrip().replace('\n', '\n# ') 

header += """# 

# Version: {} 

# 

# craw_coverage run with the following arguments: 

""".format(commented_ver) 

return header 

 

def options(): 

options = '' 

for a, v in sorted(parsed_args.__dict__.items()): 

if v is None or v is False: 

continue 

else: 

if v is True: 

options += "# --{opt}\n".format(opt=a.replace('_', '-')) 

else: 

options += "# --{opt}={val}\n".format(opt=a.replace('_', '-'), val=v) 

options.rstrip() + '\n' 

return options 

 

def padded_header(): 

metadata = '\t'.join([str(f) for f in annot_parser.header]) 

if parsed_args.start_col: 

max_left, max_right = annot_parser.max() 

pos = '\t'.join(str(p) for p in range(0 - max_left, max_right + 1)) 

else: 

pos = '\t'.join(str(p) for p in range(0 - parsed_args.before, parsed_args.after + 1)) 

s = "sense\t{metadata}\t{pos}".format(metadata=metadata, pos=pos) 

return s 

 

def sum_header(): 

metadata = '\t'.join([str(f) for f in annot_parser.header]) 

return "sense\t{metadata}\tcoverage".format(metadata=metadata) 

 

def resized_header(new_size): 

metadata = '\t'.join([str(f) for f in annot_parser.header]) 

pos = '\t'.join([str(i) for i in range(new_size)]) 

return "sense\t{metadata}\t{pos}".format(metadata=metadata, pos=pos) 

 

header = version_infos() 

header += options() 

 

if parsed_args.justify: 

header += resized_header(parsed_args.justify) 

elif parsed_args.sum: 

header += sum_header() 

else: 

header += padded_header() 

return header 

 

 

def get_version_message(): 

""" 

:return: The human readable CRAW version. 

:rtype: str 

""" 

version_text = craw.get_version_message() 

version_text += """ 

Using: 

- pysam {pysam_ver} (samtools {samtools_ver}) 

- scipy {sp_ver} (only for --justify opt) 

""".format(pysam_ver=pysam.__version__, 

samtools_ver=pysam.__samtools_version__, 

sp_ver=craw.coverage.scipy.__version__) 

return version_text 

 

 

def get_results_file(sense_opt, basename, suffix): 

""" 

 

:param str sense_opt: how to managed the sense and antisense results 

 

* **mixed**: sense and antisense are interleaved in same file 

* **split**: sense and antisense are in separated files 

* **S**: only sense results are write down 

* **AS**: only antisense are write down 

 

:param str basename: the basename of the results file 

:param str suffix: the suffix of the results file 

:return: the file objects where to write sense and antisense results 

:rtype: tuple (`file object` sense, `file object` antisense) 

""" 

if sense_opt == 'S': 

sense_filename = "{filename}.sense.{suffix}".format(filename=basename, suffix=suffix) 

sense = open(sense_filename, 'w') 

antisense = open(os.devnull, 'w') 

elif sense_opt == 'AS': 

sense = open(os.devnull, 'w') 

antisense_filename = "{filename}.antisense.{suffix}".format(filename=basename, suffix=suffix) 

antisense = open(antisense_filename, 'w') 

elif sense_opt == 'split': 

sense_filename = "{filename}.sense.{suffix}".format(filename=basename, suffix=suffix) 

sense = open(sense_filename, 'w') 

antisense_filename = "{filename}.antisense.{suffix}".format(filename=basename, suffix=suffix) 

antisense = open(antisense_filename, 'w') 

else: 

output_filename = "{filename}.{suffix}".format(filename=basename, suffix=suffix) 

sense = open(output_filename, 'w') 

antisense = sense 

return sense, antisense 

 

 

def parse_args(args): 

""" 

 

:param args: The options set on the command line (without the program name) 

:type args: list of string 

:return: 

""" 

parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter) 

input_grp = parser.add_argument_group() 

input_grp.add_argument("-b", "--bam", 

help="""The path of the bam file to analyse.  

--bam option is not compatible with any --wig or --wig-for or --wig-rev options. 

but at least --bam or any of --wig* options is required.""") 

input_grp.add_argument("-w", "--wig", 

help="""The path of the wig file to analyse. 

The file encode the coverage for the both strand. 

The positive coverage ar on the forward strand whereas the negative coverage a located on the reverse one. 

The --wig option is incompatible with both --bam or --wig-for or --wig-reverse options.""") 

input_grp.add_argument("--wig-for", 

metavar='FORWARD WIG', 

help="""The path of a wig file to analyse. 

This file encode the coverage for the forward strand. 

The --wig-for option is incompatible with both --bam or --wig options.""") 

input_grp.add_argument("--wig-rev", 

metavar='REVERSE WIG', 

help="""The path of a wig file to analyse. 

This file encode the coverage for the reverse strand. 

The --wig-rev option is incompatible with both --bam or --wig options.""") 

parser.add_argument("-a", "--annot", 

required=True, 

help="The path of the annotation file (required).") 

parser.add_argument("--qual-thr", 

dest='qual_thr', 

type=quality_checker, 

default=15, 

help="The minimal quality of read mapping to take it in account") 

parser.add_argument("-s", "--suffix", 

default="cov", 

help="The name of the suffix to use for the output file.") 

parser.add_argument('-o', '--output', 

dest='output', 

help="The path of the output (default= base name of annotation file with --suffix)") 

parser.add_argument('--sep', 

default='\t', 

help="the separator use to delimit the annotation fields") 

mutually_exclusive_opt = parser.add_mutually_exclusive_group() 

mutually_exclusive_opt.add_argument('--justify', 

type=positive_int, 

help="to resize all genes coverage to this new size.") 

mutually_exclusive_opt.add_argument('--sum', 

action='store_true', 

default=False, 

help="sum all the coverages on the window.") 

 

region_grp = parser.add_argument_group(title="region of interest", 

description="""Parameters which define regions to compute. 

 

There is 2 way to define regions: 

* all regions have same length. 

* each region have different lengths. 

 

In both case a position of reference must be define (--ref-col). 

 

If all regions have same length: 

 

--window define the number of nucleotide to take in account before and 

after the reference position (the window will be centered on reference) 

--before define the number of nucleotide to take in account before the 

reference position. 

--after define the number of nucleotide to take in account after the 

reference position. 

--before and --after allow to define non centered window. 

 

--after and --before options must be set together and are 

incompatible with --window option. 

 

If all regions have different lengths: 

 

The regions must be specified in the annotation file. 

--start-col define the name of the column in annotation file which define 

the start position of the region to compute. 

--stop-col define the name of the column in annotation file which define 

the stop position of the region to compute. 

""") 

region_grp.add_argument("--ref-col", 

default="position", 

help="The name of the column for the reference position (default: position).") 

region_grp.add_argument("--before", 

type=positive_int, 

help="The number of base to compute after the position of reference.") 

region_grp.add_argument("--after", 

type=positive_int, 

help="The number of base to compute before the position of reference.") 

region_grp.add_argument("--window", 

type=positive_int, 

help="The number of base to compute around the position of reference.") 

region_grp.add_argument("--start-col", 

help="The name of the column to define the start position.") 

region_grp.add_argument("--stop-col", 

help="The name of the column to define the stop position.") 

col_name = parser.add_argument_group(title="specify the name of columns") 

col_name.add_argument("--strand-col", 

default='strand', 

help="Specify the name of the column representing the strand (default: strand)") 

col_name.add_argument("--chr-col", 

default='chromosome', 

help="Specify the name of the column representing the chromosome (default: chromosome)") 

 

parser.add_argument("--sense", 

choices=('S', 'AS', 'split', 'mixed'), 

default='mixed', 

help="compute result only on: " 

"sense (S), " 

"antisense (AS), " 

"on both senses but produce two separated files (split), " 

"or in one file (mixed)." 

"(default: mixed)" 

) 

 

parser.add_argument("--version", 

action=argparse_util.VersionAction, 

version=get_version_message()) 

parser.add_argument("-q", "--quiet", 

action="count", 

default=0, 

help="Reduce verbosity.") 

parser.add_argument("-v", "--verbose", 

action="count", 

default=0, 

help="Increase verbosity.") 

 

parsed_args = parser.parse_args(args) 

 

input_opt_group = (parsed_args.bam, parsed_args.wig, parsed_args.wig_for, parsed_args.wig_rev) 

wig_opt_group = (parsed_args.wig, parsed_args.wig_for, parsed_args.wig_rev) 

 

############################# 

# Check wig and bam options # 

############################# 

if not any(input_opt_group): 

raise argparse.ArgumentError(None, "At least one of these options must be specified" 

" '--bam', '--wig' , '--wig-for', '--wig-rev'.") 

elif all(input_opt_group): 

raise argparse.ArgumentError(None, 

"'--bam', '--wig' , '--wig-for', '--wig-rev' cannot specify at the same time.") 

elif parsed_args.bam and any(wig_opt_group): 

raise argparse.ArgumentError(None, "'--bam' option cannot be specified in the same time as" 

" '--wig', '--wig-for' or '--wig-rev' options.") 

elif parsed_args.wig and any((parsed_args.wig_for, parsed_args.wig_rev)): 

raise argparse.ArgumentError(None, "'--wig' option cannot be specified in the same time as" 

" '--wig-for' or '--wig-rev' options.") 

########################### 

# Checking window options # 

########################### 

group_one = (parsed_args.before, parsed_args.after, parsed_args.window) 

group_two = (parsed_args.start_col, parsed_args.stop_col) 

if all([v is None for v in itertools.chain(group_one, group_two)]): 

raise argparse.ArgumentError(None, "[--window or [--before, --after] or [--start-col, --stop-col] options" 

" must be specified") 

elif any([v is not None for v in group_one]) and any([v is not None for v in group_two]): 

raise argparse.ArgumentError(None, "Options [--before, --after, --window] and [--start-col, --stop-col] " 

"are mutually exclusives.") 

elif all([v is None for v in group_two]): 

if parsed_args.window is None: 

if any([v is None for v in (parsed_args.before, parsed_args.after)]): 

raise argparse.ArgumentError(None, "The two options --after and --before work together." 

" The both options must be specified in same time") 

else: 

pass 

# window is None, before and after are specify 

# => nothing to do 

else: 

# parsed_args.window is not None: 

if any([v is not None for v in (parsed_args.before, parsed_args.after)]): 

raise argparse.ArgumentError(None, "options [--before, --after] and --window are mutually exclusives.") 

else: 

# --before, --after are None 

parsed_args.before = parsed_args.after = parsed_args.window 

elif not all(group_two): 

raise argparse.ArgumentError(None, "The two options --start-col and --stop-col work together. " 

"The both options must be specified in same time") 

return parsed_args 

 

 

def main(args=None, log_level=None): 

""" 

The entrypoint for craw_coverage script 

It will generate a coverage matrix around the position of interest 

and write the results in files 

 

:param args: the arguments and options given on the command line 

:type args: list of string as given by sys.argv without the program name 

:param log_level: the level of logger 

:type log_level: positive int or logging flag logging.DEBUG, logging.INFO, logging.ERROR, logging.CRITICAL 

""" 

args = sys.argv[1:] if args is None else args 

parsed_args = parse_args(args) 

 

if log_level is None: 

verbosity = max(logging.INFO + (parsed_args.quiet - parsed_args.verbose) * 10, 1) 

else: 

verbosity = log_level 

craw.init_logger(verbosity) 

 

####################### 

# Parsing input files # 

####################### 

with open(parsed_args.annot) as annot_file: 

annot_line_number = sum(1 for _ in annot_file) 

annot_parser = annotation.AnnotationParser(parsed_args.annot, parsed_args.ref_col, 

chr_col=parsed_args.chr_col, 

strand_col=parsed_args.strand_col, 

start_col=parsed_args.start_col, 

stop_col=parsed_args.stop_col, 

sep=parsed_args.sep) 

 

if parsed_args.bam: 

# input_data is a samfile 

input_file = parsed_args.bam 

input_data = pysam.AlignmentFile(parsed_args.bam, "rb") 

elif parsed_args.wig: 

# input_data is a wig.Genome object 

input_file = parsed_args.wig 

wig_parser = WigParser(mixed_wig=parsed_args.wig) 

input_data = wig_parser.parse() 

else: 

# input_data is a wig.Genome object 

input_file = parsed_args.wig_for 

wig_parser = WigParser(for_wig=parsed_args.wig_for, rev_wig=parsed_args.wig_rev) 

input_data = wig_parser.parse() 

 

annotations = annot_parser.get_annotations() 

 

############################ 

# checking outputs options # 

############################ 

if not parsed_args.output: 

parsed_args.output = os.path.splitext(input_file)[0] 

out_name = parsed_args.output 

suffix = parsed_args.suffix 

else: 

out_name, suffix = os.path.splitext(parsed_args.output) 

suffix = suffix.strip('.') 

if not suffix: 

suffix = parsed_args.suffix 

 

sense_file, antisense_file = get_results_file(parsed_args.sense, out_name, suffix) 

 

########################### 

# Computing output matrix # 

########################### 

with sense_file, antisense_file: 

header = get_result_header(annot_parser, parsed_args) 

 

if parsed_args.sense in ('S', 'split', 'mixed'): 

# if parsed_args.sense is mixed the sense_file and antisense_file are the same object 

print(header, file=sense_file) 

if parsed_args.sense in ('AS', 'split'): 

print(header, file=antisense_file) 

 

# get the appropriate function according to the input type 

# the 2 functions 

# - get_wig_coverage 

# - get_bam_coverage 

# have exactly the same api 

if parsed_args.justify: 

get_coverage = coverage.resized_coverage_maker(input_data, parsed_args.justify, qual_thr=None) 

elif parsed_args.sum: 

get_coverage = coverage.sum_coverage_maker(input_data, qual_thr=parsed_args.qual_thr) 

else: 

if parsed_args.window is not None: 

max_left = max_right = parsed_args.window 

elif parsed_args.before and parsed_args.after: 

max_left = parsed_args.before 

max_right = parsed_args.after 

else: 

max_left, max_right = annot_parser.max() 

get_coverage = coverage.padded_coverage_maker(input_data, max_left, max_right, qual_thr=parsed_args.qual_thr) 

 

for annot_num, annot_entry in enumerate(annotations, 1): 

if verbosity <= logging.INFO: 

progress(annot_num, annot_line_number) 

if parsed_args.start_col: 

# pos in get_coverage functions are 

# 0 based whereas in annotation they are 1 based 

# start is included, stop is excluded 

start = annot_entry.start - 1 

stop = annot_entry.stop 

else: 

if annot_entry.strand == '+': 

start = annot_entry.ref - parsed_args.before - 1 

stop = annot_entry.ref + parsed_args.after 

else: 

# if feature is on reverse strand 

# the before and after are inverted 

start = annot_entry.ref - parsed_args.after - 1 

stop = annot_entry.ref + parsed_args.before 

forward_cov, reverse_cov = get_coverage(annot_entry, start=start, stop=stop) 

 

sens = 'S' if annot_entry.strand == '+' else 'AS' 

if sens == 'S': 

print(sens, annot_entry, *forward_cov, sep='\t', file=sense_file) 

else: 

print(sens, annot_entry, *forward_cov, sep='\t', file=antisense_file) 

 

sens = 'S' if annot_entry.strand == '-' else 'AS' 

if sens == 'S': 

print(sens, annot_entry, *reverse_cov, sep='\t', file=sense_file) 

else: 

print(sens, annot_entry, *reverse_cov, sep='\t', file=antisense_file) 

 

print(file=sys.stderr) 

 

 

if __name__ == '__main__': 

main()